From 3d0a1ba61faef66e2136aae130a908228135ceaa Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Wed, 29 Jan 2025 19:43:40 +0000 Subject: [PATCH] Add validators for bboxes annotation files (#32) * Add json schemas * Add validators for VIA and COCO files * Update MANIFEST * Add tests for supported validators * Combine validators tests * Simplify JSON check * Delete placeholder * Rename schemas * Small edits caps * Update docstrings * Incorporate Niko's comments * Remove spell check suggestion from error message * Combine get_default_schema into 1 * Clarify init=False * Missing verb * log_message --> error_message * A simplified version of _extract_properties_keys based on Niko's --- MANIFEST.in | 4 + ethology/annotations/json_schemas/__init__.py | 0 .../json_schemas/schemas/COCO_schema.json | 78 ++++ .../json_schemas/schemas/README.md | 32 ++ .../json_schemas/schemas/VIA_schema.json | 88 ++++ ethology/annotations/json_schemas/utils.py | 130 ++++++ ethology/annotations/validators.py | 193 +++++++++ .../test_annotations/test_placeholder.py | 2 - .../test_annotations/test_validators.py | 408 ++++++++++++++++++ 9 files changed, 933 insertions(+), 2 deletions(-) create mode 100644 ethology/annotations/json_schemas/__init__.py create mode 100644 ethology/annotations/json_schemas/schemas/COCO_schema.json create mode 100644 ethology/annotations/json_schemas/schemas/README.md create mode 100644 ethology/annotations/json_schemas/schemas/VIA_schema.json create mode 100644 ethology/annotations/json_schemas/utils.py create mode 100644 ethology/annotations/validators.py delete mode 100644 tests/test_unit/test_annotations/test_placeholder.py create mode 100644 tests/test_unit/test_annotations/test_validators.py diff --git a/MANIFEST.in b/MANIFEST.in index e16ea33..63adff3 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -6,3 +6,7 @@ recursive-exclude * __pycache__ recursive-exclude * *.py[co] recursive-exclude docs * recursive-exclude tests * + +# 
Include json schemas +recursive-include ethology/annotations/json_schemas/schemas *.json +recursive-include ethology/annotations/json_schemas/schemas *.md diff --git a/ethology/annotations/json_schemas/__init__.py b/ethology/annotations/json_schemas/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ethology/annotations/json_schemas/schemas/COCO_schema.json b/ethology/annotations/json_schemas/schemas/COCO_schema.json new file mode 100644 index 0000000..3793027 --- /dev/null +++ b/ethology/annotations/json_schemas/schemas/COCO_schema.json @@ -0,0 +1,78 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "info": { + "type": "object" + }, + "licenses": { + "type": "array" + }, + "images": { + "type": "array", + "items": { + "type": "object", + "properties": { + "file_name": { + "type": "string" + }, + "id": { + "type": "integer" + }, + "width": { + "type": "integer" + }, + "height": { + "type": "integer" + } + } + } + }, + "annotations": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "integer" + }, + "image_id": { + "type": "integer" + }, + "bbox": { + "type": "array", + "items": { + "type": "integer" + } + }, + "category_id": { + "type": "integer" + }, + "area": { + "type": "number" + }, + "iscrowd": { + "type": "integer" + } + } + } + }, + "categories": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "integer" + }, + "name": { + "type": "string" + }, + "supercategory": { + "type": "string" + } + } + } + } + } +} diff --git a/ethology/annotations/json_schemas/schemas/README.md b/ethology/annotations/json_schemas/schemas/README.md new file mode 100644 index 0000000..5976fb2 --- /dev/null +++ b/ethology/annotations/json_schemas/schemas/README.md @@ -0,0 +1,32 @@ +## JSON schemas for manual annotations files. + +We use JSON schemas to validate the types of a supported annotation file. 
+ +Note that the schema validation only checks the type of a key if that key is present. It does not check for the presence of the keys. + +If the meta-schema (under $schema) is not provided, the jsonschema validator uses the latest released draft of the JSON schema specification. + +## VIA schema + +The VIA schema corresponds to the format exported by VGG Image Annotator 2.x.y (VIA) for object detection annotations. + +Each image under `_via_img_metadata` is indexed using a unique key: FILENAME-FILESIZE. We use "additionalProperties" to allow for any key name, see https://stackoverflow.com/a/69811612/24834957. + +The section `_via_image_id_list` contains an ordered list of image keys using a unique key: `FILENAME-FILESIZE`; the position in the list defines the image ID. + +The section `_via_attributes` contains region attributes and file attributes, to display in VIA's UI and to classify the data. + +The section `_via_data_format_version` contains the version of the VIA tool used. + + +## COCO schema +The COCO schema follows the COCO dataset format for object detection, see https://cocodataset.org/#format-data. + +Box coordinates are measured from the top left corner of the image, and are 0-indexed. 
+### References +---------- +- https://github.com/python-jsonschema/jsonschema +- https://json-schema.org/understanding-json-schema/ +- https://cocodataset.org/#format-data +- https://gitlab.com/vgg/via/-/blob/master/via-2.x.y/CodeDoc.md?ref_type=heads#description-of-via-project-json-file +- https://python-jsonschema.readthedocs.io/en/stable/api/#jsonschema.validate diff --git a/ethology/annotations/json_schemas/schemas/VIA_schema.json b/ethology/annotations/json_schemas/schemas/VIA_schema.json new file mode 100644 index 0000000..8017a90 --- /dev/null +++ b/ethology/annotations/json_schemas/schemas/VIA_schema.json @@ -0,0 +1,88 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "_via_settings": { + "type": "object", + "properties": { + "ui": { + "type": "object" + }, + "core": { + "type": "object" + }, + "project": { + "type": "object" + } + } + }, + "_via_img_metadata": { + "type": "object", + "additionalProperties": { + "type": "object", + "properties": { + "filename": { + "type": "string" + }, + "size": { + "type": "integer" + }, + "regions": { + "type": "array", + "items": { + "type": "object", + "properties": { + "shape_attributes": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "x": { + "type": "integer" + }, + "y": { + "type": "integer" + }, + "width": { + "type": "integer" + }, + "height": { + "type": "integer" + } + } + }, + "region_attributes": { + "type": "object" + } + } + } + }, + "file_attributes": { + "type": "object" + } + } + } + }, + "_via_image_id_list": { + "type": "array", + "items": { + "type": "string" + } + }, + "_via_attributes": { + "type": "object", + "properties": { + "region": { + "type": "object" + }, + "file": { + "type": "object" + } + } + }, + "_via_data_format_version": { + "type": "string" + } + } +} diff --git a/ethology/annotations/json_schemas/utils.py b/ethology/annotations/json_schemas/utils.py new file mode 100644 index 0000000..84ca7ce 
--- /dev/null +++ b/ethology/annotations/json_schemas/utils.py @@ -0,0 +1,130 @@ +"""Utility functions for JSON schema files.""" + +import json +from pathlib import Path + +import jsonschema + + +def _get_default_schema(schema_name: str) -> dict: + """Get the default VIA or COCO schema as a dictionary.""" + schema_path = ( + Path(__file__).parent / "schemas" / f"{schema_name}_schema.json" + ) + with open(schema_path) as file: + schema_dict = json.load(file) + return schema_dict + + +def _check_file_is_json(filepath: Path): + """Check the input file can be read as a JSON.""" + try: + with open(filepath) as file: + json.load(file) + except json.JSONDecodeError as decode_error: + # We override the error message for clarity + raise ValueError( + f"Error decoding JSON data from file: {filepath}. " + "The data being deserialized is not a valid JSON. " + ) from decode_error + + +def _check_file_matches_schema(filepath: Path, schema: dict | None): + """Check the input JSON file matches the given schema. + + The schema validation only checks the type for each specified + key if the key exists. It does not check that the keys in the + schema are present in the JSON file. + """ + # Read json file + with open(filepath) as file: + data = json.load(file) + + # Check against schema if provided + if schema: + jsonschema.validate(instance=data, schema=schema) + + +def _check_required_properties_keys( + required_properties_keys: list, schema: dict +): + """Check the input schema includes the required "properties" keys.""" + # Get keys of "properties" dictionaries in schema + properties_keys_in_schema = _extract_properties_keys(schema) + + # Get list of "properties" keys that are required but not in schema + missing_keys = set(required_properties_keys) - set( + properties_keys_in_schema + ) + + # Raise error if there are missing keys in the schema + if missing_keys: + raise ValueError( + f"Required key(s) {sorted(missing_keys)} not found " "in schema." 
+ ) + + +def _check_required_keys_in_dict( + list_required_keys: list[str], + data: dict, + additional_message: str = "", +): + """Check if the required keys are present in the input dictionary.""" + missing_keys = set(list_required_keys) - set(data.keys()) + if missing_keys: + raise ValueError( + f"Required key(s) {sorted(missing_keys)} not " + f"found{additional_message}." + ) + + +def _extract_properties_keys(input_schema: dict, prefix: str = "") -> list: + """Extract keys from all "properties" subdictionaries in a JSON schema. + + Recursively extract the keys of all subdictionaries in the input + dictionary that are values to a "properties" key. The input dictionary + represents a JSON schema dictionary + (see https://json-schema.org/understanding-json-schema/about). The output + is a sorted list of strings with full paths (e.g. 'parent/child'). + + The "properties" key always appears as part of a set of dictionary keys + with at least another key being "type" or "item". We use this to find the + relevant subdictionaries. 
+ + """ + result: list[str] = [] + + # Skip if "type" key is missing in the schema + if "type" not in input_schema: + return result + + # If the input dictionary has a "properties" key: extract keys + # and recurse into nested dictionaries + if "properties" in input_schema: + for key, value in input_schema["properties"].items(): + full_key = f"{prefix}/{key}" if prefix else key + result.append(full_key) + # Recurse into nested dictionaries to look for more "properties" + # dicts + result.extend(_extract_properties_keys(value, full_key)) + + # If dictionary has "additionalProperties" key: recurse into it + if "additionalProperties" in input_schema: + result.extend( + _extract_properties_keys( + input_schema["additionalProperties"], + prefix, + ) + ) + + # If dictionary has "items" key: recurse into it + if "items" in input_schema: + result.extend( + _extract_properties_keys( + input_schema["items"], + prefix, + ) + ) + + # Return sorted list of keys with full paths + return sorted(result) diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py new file mode 100644 index 0000000..e830ce8 --- /dev/null +++ b/ethology/annotations/validators.py @@ -0,0 +1,193 @@ +"""Validators for supported annotation files.""" + +import json +from pathlib import Path + +from attrs import define, field + +from ethology.annotations.json_schemas.utils import ( + _check_file_is_json, + _check_file_matches_schema, + _check_required_keys_in_dict, + _get_default_schema, +) + + +@define +class ValidVIA: + """Class for valid VIA JSON files. + + It checks the input file is a valid JSON file, matches + the VIA schema and contains the required keys. + + + Attributes + ---------- + path : pathlib.Path + Path to the VIA JSON file, passed as an input. + schema : dict + The JSON schema is set to the default VIA schema. + required_keys : dict + The required keys for the VIA JSON file. + + Raises + ------ + ValueError + If the JSON file cannot be decoded. 
+ jsonschema.exceptions.ValidationError + If the type of any of the keys in the JSON file + does not match the type specified in the schema. + jsonschema.exceptions.SchemaError + If the schema is invalid. + ValueError + If the VIA JSON file is missing any of the required keys. + + """ + + path: Path = field() + schema: dict = field( + default=_get_default_schema("VIA"), + init=False, + ) + required_keys: dict = field( + default={ + "main": ["_via_img_metadata", "_via_image_id_list"], + "images": ["filename", "regions"], + "regions": ["shape_attributes", "region_attributes"], + "shape_attributes": ["x", "y", "width", "height"], + }, + init=False, + # with init=False the attribute is always initialized + # with the default value + ) + + # Note: the validators are applied in order + @path.validator + def _file_is_json(self, attribute, value): + _check_file_is_json(value) + + @path.validator + def _file_matches_JSON_schema(self, attribute, value): + _check_file_matches_schema(value, self.schema) + + @path.validator + def _file_contains_required_keys(self, attribute, value): + """Ensure that the VIA JSON file contains the required keys.""" + # Read data as dict + with open(value) as file: + data = json.load(file) + + # Check first level keys + _check_required_keys_in_dict(self.required_keys["main"], data) + + # Check keys in nested dicts + for img_str, img_dict in data["_via_img_metadata"].items(): + # Check keys for each image dictionary + _check_required_keys_in_dict( + self.required_keys["images"], + img_dict, + additional_message=f" for {img_str}", + ) + + # Check keys for each region in an image + for i, region in enumerate(img_dict["regions"]): + # Check keys under first level per region + _check_required_keys_in_dict( + self.required_keys["regions"], + region, + additional_message=f" for region {i} under {img_str}", + ) + + # Check keys under "shape_attributes" per region + _check_required_keys_in_dict( + self.required_keys["shape_attributes"], + 
region["shape_attributes"], + additional_message=f" for region {i} under {img_str}", + ) + + +@define +class ValidCOCO: + """Class for valid COCO JSON files. + + It checks the input file is a valid JSON file, matches + the COCO schema and contains the required keys. + + Attributes + ---------- + path : pathlib.Path + Path to the COCO JSON file, passed as an input. + schema : dict + The JSON schema is set to the default COCO schema. + required_keys : dict + The required keys for the COCO JSON file. + + Raises + ------ + ValueError + If the JSON file cannot be decoded. + jsonschema.exceptions.ValidationError + If the type of any of the keys in the JSON file + does not match the type specified in the schema. + jsonschema.exceptions.SchemaError + If the schema is invalid. + ValueError + If the COCO JSON file is missing any of the required keys. + + """ + + path: Path = field() + schema: dict = field( + default=_get_default_schema("COCO"), + init=False, + # with init=False the attribute is always initialized + # with the default value + ) + + # The keys of "required_keys" match the 1st level keys in a COCO JSON file + required_keys: dict = field( + default={ + "main": ["images", "annotations", "categories"], + "images": ["id", "file_name"], + "annotations": ["id", "image_id", "bbox", "category_id"], + "categories": ["id", "name", "supercategory"], + }, + init=False, + ) + + # Note: the validators are applied in order + @path.validator + def _file_is_json(self, attribute, value): + _check_file_is_json(value) + + @path.validator + def _file_matches_JSON_schema(self, attribute, value): + _check_file_matches_schema(value, self.schema) + + @path.validator + def _file_contains_required_keys(self, attribute, value): + """Ensure that the COCO JSON file contains the required keys.""" + + # Helper function to singularise the input key for the + # error message + def _singularise_err_msg(key): + return key[:-1] if key != "categories" else key[:-3] + "y" + + # Read file as dict + 
with open(value) as file: + data = json.load(file) + + # Check first level keys + _check_required_keys_in_dict(self.required_keys["main"], data) + + # Check keys in every dict listed under the "images", "annotations" + # and "categories" keys + for ky in list(self.required_keys.keys())[1:]: + for instance_dict in data[ky]: + _check_required_keys_in_dict( + self.required_keys[ky], + instance_dict, + additional_message=( + f" for {_singularise_err_msg(ky)} {instance_dict}" + ), + ) diff --git a/tests/test_unit/test_annotations/test_placeholder.py b/tests/test_unit/test_annotations/test_placeholder.py deleted file mode 100644 index 3ada1ee..0000000 --- a/tests/test_unit/test_annotations/test_placeholder.py +++ /dev/null @@ -1,2 +0,0 @@ -def test_placeholder(): - assert True diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py new file mode 100644 index 0000000..07ac436 --- /dev/null +++ b/tests/test_unit/test_annotations/test_validators.py @@ -0,0 +1,408 @@ +import json +from contextlib import nullcontext as does_not_raise +from pathlib import Path + +import jsonschema +import pytest + +from ethology.annotations.json_schemas.utils import ( + _check_required_keys_in_dict, + _check_required_properties_keys, + _extract_properties_keys, +) +from ethology.annotations.validators import ValidCOCO, ValidVIA + + +@pytest.fixture() +def json_file_decode_error(tmp_path: Path) -> Path: + """Return path to a JSON file with a decoding error.""" + json_file = tmp_path / "JSON_decode_error.json" + with open(json_file, "w") as f: + f.write("just-a-string") + return json_file + + +@pytest.fixture() +def json_file_not_found_error(tmp_path: Path) -> Path: + """Return path to a JSON file that does not exist.""" + return tmp_path / "JSON_file_not_found.json" + + +@pytest.fixture() +def VIA_file_schema_mismatch( + annotations_test_data: dict, + tmp_path: Path, +) -> Path: + """Return path to a VIA JSON file that does not 
match its schema. + + Specifically, we modify the type of the "width" of the first bounding box + in the first image, from "int" to "str" + """ + # Read valid JSON file + valid_VIA_file_sample_1 = annotations_test_data["VIA_JSON_sample_1.json"] + with open(valid_VIA_file_sample_1) as f: + data = json.load(f) + + # Modify file so that it doesn't match the corresponding schema + # (make width a string) + _, img_dict = list(data["_via_img_metadata"].items())[0] + img_dict["regions"][0]["shape_attributes"]["width"] = "49" + + # Save the modified JSON to a new file + out_json = tmp_path / f"{valid_VIA_file_sample_1.stem}_schema_error.json" + with open(out_json, "w") as f: + json.dump(data, f) + return out_json + + +@pytest.fixture() +def COCO_file_schema_mismatch( + annotations_test_data: dict, + tmp_path: Path, +) -> Path: + """Return path to a COCO JSON file that doesn't match its schema. + + Specifically, we modify the type of the object under the "annotations" + key from "list of dicts" to "list" + """ + # Read valid JSON file + valid_COCO_file_sample_1 = annotations_test_data["COCO_JSON_sample_1.json"] + with open(valid_COCO_file_sample_1) as f: + data = json.load(f) + + # Modify file so that it doesn't match the corresponding schema + data["annotations"] = [1, 2, 3] # [d] for d in data["annotations"]] + + # save the modified json to a new file + out_json = tmp_path / f"{valid_COCO_file_sample_1.stem}_schema_error.json" + with open(out_json, "w") as f: + json.dump(data, f) + return out_json + + +@pytest.fixture() +def small_schema() -> dict: + """Small schema with properties keys: + ["a", "b", "b/b1", "c", "c/c1", "c/c2"]. 
+ """ + return { + "type": "object", + "properties": { + "a": { + "type": "array", + "items": {"type": "string"}, + }, + "b": { + "type": "object", + "properties": {"b1": {"type": "string"}}, + }, + "c": { + "type": "object", + "properties": { + "c1": {"type": "string"}, + "c2": {"type": "string"}, + }, + }, + }, + } + + +@pytest.fixture() +def default_VIA_schema() -> dict: + """Get default VIA schema.""" + from ethology.annotations.json_schemas.utils import _get_default_schema + + return _get_default_schema("VIA") + + +@pytest.fixture() +def default_COCO_schema() -> dict: + """Get default COCO schema.""" + from ethology.annotations.json_schemas.utils import ( + _get_default_schema, + ) + + return _get_default_schema("COCO") + + +@pytest.mark.parametrize( + "input_file, validator", + [ + ("VIA_JSON_sample_1.json", ValidVIA), + ("VIA_JSON_sample_2.json", ValidVIA), + ("COCO_JSON_sample_1.json", ValidCOCO), + ("COCO_JSON_sample_2.json", ValidCOCO), + ], +) +def test_validators_valid_input_files( + input_file: str, + validator: type[ValidVIA | ValidCOCO], + annotations_test_data: dict, +): + """Test the file validator with valid inputs.""" + filepath = annotations_test_data[input_file] + with does_not_raise(): + validator(path=filepath) + + +@pytest.mark.parametrize( + "invalid_input_file, validator, expected_exception, error_message", + [ + ( + "json_file_decode_error", + ValidVIA, + pytest.raises(ValueError), + "Error decoding JSON data from file", + ), + ( + "json_file_not_found_error", + ValidVIA, + pytest.raises(FileNotFoundError), + "No such file or directory: ", + ), + ( + "json_file_decode_error", + ValidCOCO, + pytest.raises(ValueError), + "Error decoding JSON data from file", + ), + ( + "json_file_not_found_error", + ValidCOCO, + pytest.raises(FileNotFoundError), + "No such file or directory: ", + ), + ( + "VIA_file_schema_mismatch", + ValidVIA, + pytest.raises(jsonschema.exceptions.ValidationError), + "'49' is not of type 'integer'", + ), + ( + 
"COCO_file_schema_mismatch", + ValidCOCO, + pytest.raises(jsonschema.exceptions.ValidationError), + "3 is not of type 'object'", + ), + ], +) +def test_validators_invalid_input_files( + invalid_input_file: str, + validator: type[ValidVIA | ValidCOCO], + expected_exception: pytest.raises, + error_message: str, + request: pytest.FixtureRequest, +): + """Test the validators throw the expected errors when passed invalid + inputs. + """ + invalid_json_file = request.getfixturevalue(invalid_input_file) + + with expected_exception as excinfo: + validator(path=invalid_json_file) + + # Check that the error message contains expected string + assert error_message in str(excinfo.value) + + # Check the error message contains file path + if not isinstance(excinfo.value, jsonschema.exceptions.ValidationError): + assert invalid_json_file.name in str(excinfo.value) + + +@pytest.mark.parametrize( + "schema, expected_properties_keys", + [ + ("small_schema", ["a", "b", "b/b1", "c", "c/c1", "c/c2"]), + ( + "default_VIA_schema", + [ + "_via_attributes", + "_via_attributes/file", + "_via_attributes/region", + "_via_data_format_version", + "_via_image_id_list", + "_via_img_metadata", + "_via_img_metadata/file_attributes", + "_via_img_metadata/filename", + "_via_img_metadata/regions", + "_via_img_metadata/regions/region_attributes", + "_via_img_metadata/regions/shape_attributes", + "_via_img_metadata/regions/shape_attributes/height", + "_via_img_metadata/regions/shape_attributes/name", + "_via_img_metadata/regions/shape_attributes/width", + "_via_img_metadata/regions/shape_attributes/x", + "_via_img_metadata/regions/shape_attributes/y", + "_via_img_metadata/size", + "_via_settings", + "_via_settings/core", + "_via_settings/project", + "_via_settings/ui", + ], + ), + ( + "default_COCO_schema", + [ + "annotations", + "annotations/area", + "annotations/bbox", + "annotations/category_id", + "annotations/id", + "annotations/image_id", + "annotations/iscrowd", + "categories", + "categories/id", 
+ "categories/name", + "categories/supercategory", + "images", + "images/file_name", + "images/height", + "images/id", + "images/width", + "info", + "licenses", + ], + ), + ], +) +def test_extract_properties_keys( + schema: dict, + expected_properties_keys: list, + request: pytest.FixtureRequest, +): + """Test the _extract_properties_keys helper function.""" + schema = request.getfixturevalue(schema) + assert _extract_properties_keys(schema) == sorted(expected_properties_keys) + + +@pytest.mark.parametrize( + "list_required_keys, data_dict, additional_message, expected_exception", + [ + ( + ["images", "annotations", "categories"], + {"images": "", "annotations": "", "categories": ""}, + "", + does_not_raise(), + ), # zero missing keys + ( + ["images", "annotations", "categories"], + {"annotations": "", "categories": ""}, + "", + pytest.raises(ValueError), + ), # one missing key + ( + ["images", "annotations", "categories"], + {"annotations": ""}, + "", + pytest.raises(ValueError), + ), # two missing keys + ( + ["images", "annotations", "categories"], + {"annotations": "", "categories": ""}, + "FOO", + pytest.raises(ValueError), + ), # one missing key with additional message + ], +) +def test_check_required_keys_in_dict( + list_required_keys: list, + data_dict: dict, + additional_message: str, + expected_exception: pytest.raises, +): + """Test the _check_required_keys_in_dict helper function.""" + with expected_exception as excinfo: + _check_required_keys_in_dict( + list_required_keys, data_dict, additional_message + ) + + if excinfo: + missing_keys = set(list_required_keys) - data_dict.keys() + assert str(excinfo.value) == ( + f"Required key(s) {sorted(missing_keys)} " + f"not found{additional_message}." + ) + + +def test_check_required_properties_keys(small_schema: dict): + """Test the _check_required_keys helper function.""" + # Define a sample schema from "small_schema" + # with a "properties" key missing (e.g. 
"c/c2") + small_schema["properties"]["c"]["properties"].pop("c2") + + # Define required "properties" keys + required_keys = ["a", "b", "c/c2"] + + # Run check + with pytest.raises(ValueError) as excinfo: + _check_required_properties_keys(required_keys, small_schema) + + # Check error message + assert "Required key(s) ['c/c2'] not found in schema" in str(excinfo.value) + + +@pytest.mark.parametrize( + "input_file,", + [ + "VIA_JSON_sample_1.json", + "VIA_JSON_sample_2.json", + ], +) +def test_required_keys_in_VIA_schema( + input_file: str, default_VIA_schema: dict, annotations_test_data: dict +): + """Check the provided VIA schema contains the ValidVIA required keys.""" + # Get required keys from a VIA valid file + filepath = annotations_test_data[input_file] + valid_VIA = ValidVIA(path=filepath) + required_VIA_keys = valid_VIA.required_keys + + # Map required keys to "properties" keys in schema + map_required_to_properties_keys = { + "main": "", + "images": "_via_img_metadata", + "regions": "_via_img_metadata/regions", + "shape_attributes": "_via_img_metadata/regions/shape_attributes", + } + + # Express required keys as required "properties" keys + required_property_keys = [ + val if ky == "main" else f"{map_required_to_properties_keys[ky]}/{val}" + for ky, values in required_VIA_keys.items() + for val in values + ] + + # Run check + _check_required_properties_keys( + required_property_keys, + default_VIA_schema, + ) + + +@pytest.mark.parametrize( + "input_file,", + [ + "COCO_JSON_sample_1.json", + "COCO_JSON_sample_2.json", + ], +) +def test_required_keys_in_COCO_schema( + input_file: str, default_COCO_schema: dict, annotations_test_data: dict +): + """Check the provided COCO schema contains the ValidCOCO required keys.""" + # Get required keys from a COCO valid file + filepath = annotations_test_data[input_file] + valid_COCO = ValidCOCO(path=filepath) + required_COCO_keys = valid_COCO.required_keys + + # Prepare list of required "properties" keys with full paths 
+ required_properties_keys = [ + f"{level}/{ky}" if level != "main" else ky + for level, required_keys in required_COCO_keys.items() + for ky in required_keys + ] + + # Run check + _check_required_properties_keys( + required_properties_keys, + default_COCO_schema, + )