diff --git a/ethology/annotations/io.py b/ethology/annotations/io.py new file mode 100644 index 0000000..baa65f6 --- /dev/null +++ b/ethology/annotations/io.py @@ -0,0 +1,468 @@ +"""Module for reading and writing manually labelled annotations.""" + +import ast +import json +from collections.abc import Callable +from pathlib import Path +from typing import Literal + +import pandas as pd + +from ethology.annotations.validators import ValidCOCO, ValidVIA, ValidVIAcsv + +# definition of standard bboxes dataframe +STANDARD_BBOXES_DF_INDEX = "annotation_id" +STANDARD_BBOXES_DF_COLUMNS = [ + "image_filename", + "image_id", + "x_min", + "y_min", + "width", + "height", + "supercategory", + "category", + "image_width", + "image_height", +] # if a column is not defined, it is filled with nan + + +def df_bboxes_from_files( + file_paths: Path | list[Path], + format: Literal["VIA", "COCO", "VIAcsv"], + images_dirs: Path | list[Path] | None = None, + **kwargs, +) -> pd.DataFrame: + """Read bounding boxes annotations as a dataframe. + + Parameters + ---------- + file_paths : Path | list[Path] + Path or list of paths to the input annotations. + format : Literal["VIA", "COCO", "VIAcsv"] + Format of the input annotation files. + images_dirs : Path | list[Path], optional + Path or list of paths to the directories containing the images. + **kwargs + Additional keyword arguments to pass to the + ``pandas.DataFrame.drop_duplicates`` method. The ``ignore_index=True`` + argument is always applied to force an index reset, and the ``inplace`` + argument is set to `False` and cannot be overridden. The settings + apply if one or multiple files are read. + + Returns + ------- + pd.DataFrame + Bounding boxes annotations dataframe. The dataframe is indexed by + "annotation_id" and has the following columns: "image_filename", + "image_id", "x_min", "y_min", "width", "height", "supercategory", + "category". + + See Also + -------- + pandas.concat : Concatenate pandas objects along a particular axis. + + pandas.DataFrame.drop_duplicates : Return DataFrame with duplicate rows + removed. + + """ + # Check kwargs that are forwarded to drop_duplicates + for fixed_kwargs in ["ignore_index", "inplace"]: + if fixed_kwargs in kwargs: + raise ValueError( + f"The '{fixed_kwargs}' argument for " + "`pandas.DataFrame.drop_duplicates` may not be overridden." + ) + + if isinstance(file_paths, list): + # Read multiple files + df_all = _df_bboxes_from_multiple_files( + file_paths, format=format, **kwargs + ) + + else: + # Read single VIA file + df_all = _df_bboxes_from_single_file( + file_paths, format=format, **kwargs + ) + + # Add metadata + df_all.metadata = { + "input_files": file_paths, + "format": format, + "images_dirs": images_dirs, + } + + return df_all + + +def _df_bboxes_from_multiple_files( + list_filepaths: list[Path], + format: Literal["VIA", "COCO", "VIAcsv"], + **kwargs, +): + """Read bounding boxes annotations from multiple files. + + Parameters + ---------- + list_filepaths : list[Path] + List of input annotation filepaths. + format : Literal["VIA", "COCO", "VIAcsv"] + Format of the input files. + Currently supported formats are "VIA", "VIAcsv" and "COCO". + **kwargs + Additional keyword arguments to pass to the + ``pandas.DataFrame.drop_duplicates`` method. The ``ignore_index=True`` + argument is always applied to force an index reset, and the ``inplace`` + argument is set to `False` and cannot be overridden. The settings + apply if one or multiple files are read. 
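+        For example, passing ``subset="image_id"`` restricts the
+        duplicate check to that column, and ``keep="last"`` retains the
+        last occurrence of each set of duplicate rows.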
+ + Returns + ------- + pd.DataFrame + Bounding boxes annotations dataframe. The dataframe is indexed + by "annotation_id" and has the following columns: "image_filename", + "image_id", "x_min", "y_min", "width", "height", "supercategory", + "category". + + """ + # Get list of dataframes + df_list = [ + _df_bboxes_from_single_file(file, format=format) + for file in list_filepaths + ] + + # Concatenate with ignore_index=True, + # so that the resulting axis is labeled 0,1,…,n - 1. + # NOTE: after ignore_index=True the index name is no longer "annotation_id" + df_all = pd.concat(df_list, ignore_index=True) + + # Update image_id based on the full sorted list of image filenames + list_image_filenames = sorted(list(df_all["image_filename"].unique())) + df_all["image_id"] = df_all["image_filename"].apply( + lambda x: list_image_filenames.index(x) + ) + + # Remove duplicates + df_all = df_all.drop_duplicates(ignore_index=True, inplace=False, **kwargs) + + # Set the index name to "annotation_id" + df_all.index.name = STANDARD_BBOXES_DF_INDEX + + return df_all + + +def _df_bboxes_from_single_file( + file_path: Path, format: Literal["VIA", "COCO", "VIAcsv"], **kwargs +) -> pd.DataFrame: + """Read bounding boxes annotations from a single file. + + Parameters + ---------- + file_path : Path + Path to the input annotations file. + format : Literal["VIA", "COCO", "VIAcsv"] + Format of the input annotations file. + Currently supported formats are "VIA", "VIAcsv" and "COCO". + **kwargs + Additional keyword arguments to pass to the + ``pandas.DataFrame.drop_duplicates`` method. The ``ignore_index=True`` + argument is always applied to force an index reset, and the ``inplace`` + argument is set to `False` and cannot be overridden. The settings + apply if one or multiple files are read. + + Returns + ------- + pd.DataFrame + Bounding boxes annotations dataframe. The dataframe is indexed + by "annotation_id" and has the following columns: "image_filename", + "image_id", "x_min", "y_min", "width", "height", "supercategory", + "category". + + """ + if format == "VIA": + return _df_bboxes_from_single_specific_file( + file_path, + validator=ValidVIA, + get_rows_from_file=_df_rows_from_valid_VIA_file, + **kwargs, + ) + elif format == "COCO": + return _df_bboxes_from_single_specific_file( + file_path, + validator=ValidCOCO, + get_rows_from_file=_df_rows_from_valid_COCO_file, + **kwargs, + ) + elif format == "VIAcsv": + return _df_bboxes_from_single_specific_file( + file_path, + validator=ValidVIAcsv, + get_rows_from_file=_df_rows_from_valid_VIA_csv_file, + **kwargs, + ) + else: + raise ValueError(f"Unsupported format: {format}") + + +def _df_bboxes_from_single_specific_file( + file_path: Path, + validator: type[ValidVIA] | type[ValidCOCO] | type[ValidVIAcsv], + get_rows_from_file: Callable, + **kwargs, +) -> pd.DataFrame: + """Read bounding boxes annotations from a single specific file. + + Parameters + ---------- + file_path : Path + Path to the input annotations file. + validator : type[ValidVIA] | type[ValidCOCO] + Validator class for the input annotations file. + get_rows_from_file : Callable + Function to extract rows from the validated input annotations file. + **kwargs + Additional keyword arguments to pass to the + ``pandas.DataFrame.drop_duplicates`` method. The ``ignore_index=True`` + argument is always applied to force an index reset, and the ``inplace`` + argument is set to `False` and cannot be overridden. The settings + apply if one or multiple files are read. 
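+        For example, ``keep=False`` drops all copies of a duplicated
+        annotation instead of retaining the first.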
+ + Returns + ------- + pd.DataFrame + Bounding boxes annotations dataframe. The dataframe is indexed + by "annotation_id" and has the following columns: "image_filename", + "image_id", "x_min", "y_min", "width", "height", "supercategory", + "category". + + """ + # Validate file + valid_file = validator(file_path) + + # Build dataframe from extracted rows + list_rows = get_rows_from_file(valid_file.path) + df = pd.DataFrame(list_rows) + + # Set "annotation_id" as index + # (otherwise duplicate annotations are not identified as such) + df = df.set_index(STANDARD_BBOXES_DF_INDEX) + + # Drop duplicates and reset indices. + # We use ignore_index=True so that the resulting axis is labeled 0,1,…,n-1. + # NOTE: after this the index name is no longer "annotation_id" + df = df.drop_duplicates(ignore_index=True, inplace=False, **kwargs) + + # Reorder columns to match standard columns + df = df.reindex(columns=STANDARD_BBOXES_DF_COLUMNS) + + # Set the index name to "annotation_id" + df.index.name = STANDARD_BBOXES_DF_INDEX + + # Read as standard dataframe + return df + + +def _df_rows_from_valid_VIA_file(file_path: Path) -> list[dict]: + """Extract list of rows from validated VIA JSON file. + + Parameters + ---------- + file_path : Path + Path to the validated VIA JSON file. + + Returns + ------- + list[dict] + List of rows extracted from the VIA JSON file. + + """ + # Read validated json as dict + with open(file_path) as file: + data_dict = json.load(file) + + # Prepare data + image_metadata_dict = data_dict["_via_img_metadata"] + via_image_id_list = data_dict["_via_image_id_list"] + via_attributes = data_dict["_via_attributes"] + supercategories_props = {} + if "region" in via_attributes: + supercategories_props = via_attributes["region"] + + # Map image filenames to the image keys used by VIA + # the VIA keys are strings + map_filename_to_via_img_id = { + img_dict["filename"]: ky + for ky, img_dict in image_metadata_dict.items() + } + + # Get list of rows in dataframe + list_rows = [] + annotation_id = 0 + # loop thru images + for _, img_dict in image_metadata_dict.items(): + # loop thru annotations in the image + for region in img_dict["regions"]: + # Extract region data + region_shape = region["shape_attributes"] + region_attributes = region["region_attributes"] + + # Define supercategory and category. + # We take first key in "region_attributes" as the supercategory, + # and its value as category_id_str + if region_attributes and supercategories_props: + supercategory = sorted(list(region_attributes.keys()))[0] + category_id_str = region_attributes[supercategory] + category = supercategories_props[supercategory]["options"][ + category_id_str + ] + else: + supercategory = "" + category = "" + + row = { + "annotation_id": annotation_id, + "image_filename": img_dict["filename"], + "image_id": via_image_id_list.index( + map_filename_to_via_img_id[img_dict["filename"]] + ), # integer based on the VIA image ID + "x_min": region_shape["x"], + "y_min": region_shape["y"], + "width": region_shape["width"], + "height": region_shape["height"], + "supercategory": supercategory, + "category": category, + } + + list_rows.append(row) + + # update "annotation_id" + annotation_id += 1 + + return list_rows + + +def _df_rows_from_valid_VIA_csv_file(file_path: Path): + """Extract list of rows from validated VIA CSV file. + + Parameters + ---------- + file_path : Path + Path to the validated VIA CSV file. + + Returns + ------- + list[dict] + List of rows extracted from the VIA CSV file. 
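+
+    Notes
+    -----
+    The first key of each annotation's ``region_attributes`` dictionary
+    is taken as the supercategory, and the "category" column stores the
+    corresponding raw value, which is a category ID string rather than
+    a human-readable category name.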
+ + """ + # Read input csv file + df = pd.read_csv(file_path) + + # Map image filenames to unique image IDs + image_filenames = sorted(df["filename"].unique()) + map_filename_to_image_id = {f: i for i, f in enumerate(image_filenames)} + + list_rows = [] + for df_index, df_row in df.iterrows(): + annotation_id = df_index + + image_filename = df_row["filename"] + image_id = map_filename_to_image_id[image_filename] + + region_shape_attrs = ast.literal_eval( + df_row["region_shape_attributes"] + ) + region_attributes = ast.literal_eval(df_row["region_attributes"]) + + x_min = region_shape_attrs["x"] + y_min = region_shape_attrs["y"] + width = region_shape_attrs["width"] + height = region_shape_attrs["height"] + + supercategory = list(region_attributes.keys())[0] + category_id = region_attributes[supercategory] + + row = { + "annotation_id": annotation_id, + "image_filename": image_filename, + "image_id": image_id, + "x_min": x_min, + "y_min": y_min, + "width": width, + "height": height, + "supercategory": supercategory, + "category": category_id, # category ID ! + } + list_rows.append(row) + + return list_rows + + +def _df_rows_from_valid_COCO_file(file_path: Path) -> list[dict]: + """Extract list of rows from validated COCO JSON file. + + Parameters + ---------- + file_path : Path + Path to the validated COCO JSON file. + + Returns + ------- + list[dict] + List of rows extracted from the COCO JSON file. + + """ + # Read validated json as dict + with open(file_path) as file: + data_dict = json.load(file) + + # Prepare data + map_image_id_to_filename = { + img_dict["id"]: img_dict["file_name"] + for img_dict in data_dict["images"] + } + map_image_id_to_width_height = { + img_dict["id"]: (img_dict["width"], img_dict["height"]) + for img_dict in data_dict["images"] + } + + map_category_id_to_category_data = { + cat_dict["id"]: (cat_dict["name"], cat_dict["supercategory"]) + for cat_dict in data_dict["categories"] + } # category data: category name, supercategor name + + # Build standard dataframe + list_rows = [] + for annot_dict in data_dict["annotations"]: + annotation_id = annot_dict["id"] + + # image data + image_id = annot_dict["image_id"] + image_filename = map_image_id_to_filename[image_id] + image_width = map_image_id_to_width_height[image_id][0] + image_height = map_image_id_to_width_height[image_id][1] + + # bbox data + x_min, y_min, width, height = annot_dict["bbox"] + + # category data + category_id = annot_dict["category_id"] + category, supercategory = map_category_id_to_category_data[category_id] + + row = { + "annotation_id": annotation_id, + "image_filename": image_filename, + "image_id": image_id, + "image_width": image_width, + "image_height": image_height, + "x_min": x_min, + "y_min": y_min, + "width": width, + "height": height, + "supercategory": supercategory, + "category": category, + } + + list_rows.append(row) + + return list_rows diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index e830ce8..a476f02 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -1,8 +1,10 @@ """Validators for supported annotation files.""" +import ast import json from pathlib import Path +import pandas as pd from attrs import define, field from ethology.annotations.json_schemas.utils import ( @@ -51,7 +53,11 @@ class ValidVIA: ) required_keys: dict = field( default={ - "main": ["_via_img_metadata", "_via_image_id_list"], + "main": [ + "_via_img_metadata", + "_via_image_id_list", + "_via_attributes", + ], "images": 
["filename", "regions"], "regions": ["shape_attributes", "region_attributes"], "shape_attributes": ["x", "y", "width", "height"], @@ -191,3 +197,65 @@ def _singularise_err_msg(key): f" for {_singularise_err_msg(ky)} {instance_dict}" ), ) + + +@define +class ValidVIAcsv: + """Class for valid VIA CSV files. + + It checks the input CSV file contains the expected header and + represents rectangular bounding boxes. + + Attributes + ---------- + path : pathlib.Path + Path to the VIA CSV file, passed as an input. + required_keys : dict + The required keys for the VIA CSV file. + + Raises + ------ + ValueError + If the VIA CSV file is missing any of the required keys. + + """ + + path: Path = field() + + @path.validator + def _check_file_contains_valid_header(self, attribute, value): + """Ensure the VIA .csv file contains the expected header.""" + expected_header = [ + "filename", + "file_size", + "file_attributes", + "region_count", + "region_id", + "region_shape_attributes", + "region_attributes", + ] + + with open(value) as f: + header = f.readline().strip("\n").split(",") + if header != expected_header: + raise ValueError( + ".csv header row does not match the known format for " + "VIA .csv files. " + f"Expected {expected_header} but got {header}.", + ) + + @path.validator + def _check_region_shape(self, attribute, value): + df = pd.read_csv(value, sep=",", header=0) + + for row in df.itertuples(): + region_shape_attrs = ast.literal_eval(row.region_shape_attributes) + + # check annotation is a rectangle + if region_shape_attrs["name"] != "rect": + raise ValueError( + f"{row.filename} (row {row.Index}): " + "bounding box shape must be 'rect' (rectangular) " + "but instead got " + f"'{region_shape_attrs['name']}'.", + ) diff --git a/tests/test_unit/test_annotations/test_io.py b/tests/test_unit/test_annotations/test_io.py new file mode 100644 index 0000000..abc67a2 --- /dev/null +++ b/tests/test_unit/test_annotations/test_io.py @@ -0,0 +1,524 @@ +from collections.abc import Callable +from contextlib import nullcontext as does_not_raise +from pathlib import Path +from typing import Literal +from unittest.mock import patch + +import pandas as pd +import pytest + +from ethology.annotations.io import ( + STANDARD_BBOXES_DF_COLUMNS, + STANDARD_BBOXES_DF_INDEX, + _df_bboxes_from_multiple_files, + _df_bboxes_from_single_file, + _df_bboxes_from_single_specific_file, + _df_rows_from_valid_COCO_file, + _df_rows_from_valid_VIA_file, + df_bboxes_from_files, +) +from ethology.annotations.validators import ValidCOCO, ValidVIA + + +@pytest.fixture +def multiple_input_files(annotations_test_data: dict) -> dict: + """Fixture that returns for each format, a pair of annotation files + with their number of annotations and images. 
+ """ + return { + "VIA": [ + { + "path": annotations_test_data["VIA_JSON_sample_1.json"], + "n_annotations": 4440, + "n_images": 50, + }, + { + "path": annotations_test_data["VIA_JSON_sample_2.json"], + "n_annotations": 3977, + "n_images": 50, + }, + ], + "COCO": [ + { + "path": annotations_test_data["COCO_JSON_sample_1.json"], + "n_annotations": 4344, + "n_images": 100, + }, + { + "path": annotations_test_data["COCO_JSON_sample_2.json"], + "n_annotations": 4618, + "n_images": 100, + }, + ], + } + + +def assert_dataframe( + df: pd.DataFrame, + expected_n_annotations: int, + expected_n_images: int, + expected_supercategories: str | list[str], + expected_categories: str | list[str], + expected_annots_per_image: int | None = None, +): + """Check that the dataframe has the expected shape and content.""" + # Check shape of dataframe + assert df.shape[0] == expected_n_annotations + + # Check annotation_id is the index name, and that IDs are unique + assert df.index.name == STANDARD_BBOXES_DF_INDEX + assert len(set(df.index)) == expected_n_annotations + + # Check number of images + assert len(df["image_filename"].unique()) == expected_n_images + assert len(df["image_id"].unique()) == expected_n_images + + # Check columns are as expected + assert df.columns.tolist() == STANDARD_BBOXES_DF_COLUMNS + + # Check supercategories are as expected + assert df["supercategory"].unique() == expected_supercategories + + # Check categories are as expected + assert df["category"].unique() == expected_categories + + # Check number of annotations per image if provided + if expected_annots_per_image: + assert all( + df.groupby("image_id").count()["x_min"] + == expected_annots_per_image + ) # count number of "x_min" values when grouping by "image_id" + + +@pytest.mark.parametrize( + "input_format", + [ + "VIA", + "COCO", + ], +) +@pytest.mark.parametrize( + "images_dirs", + [ + [Path("/path/to/images")], # single directory + [Path("/path/to/images1"), Path("/path/to/images2")], # multiple dirs + None, # no images directories + ], +) +@pytest.mark.parametrize( + "file_path, function_to_mock", + [ + ( + Path("/path/to/file"), # single file + "ethology.annotations.io._df_bboxes_from_single_file", + ), + ( + [Path("/path/to/file1"), Path("/path/to/file2")], # multiple files + "ethology.annotations.io._df_bboxes_from_multiple_files", + ), + ], +) +def test_df_bboxes_from_files( + input_format: Literal["VIA", "COCO"], + images_dirs: Path | list[Path] | None, + file_path: Path, + function_to_mock: str, +): + """Test that the general bounding boxes loading function delegates + correctly to the single or multiple file readers, and check the + metadata is added correctly. + """ + # Call general function and see if mocked function is called + with patch(function_to_mock) as mock: + df = df_bboxes_from_files( + file_path, + format=input_format, + images_dirs=images_dirs, + ) + mock.assert_called_once_with(file_path, format=input_format) + + # Check metadata + assert df.metadata["input_files"] == file_path + assert df.metadata["format"] == input_format + if images_dirs: + assert df.metadata["images_dirs"] == images_dirs + + +@pytest.mark.parametrize( + "input_format", + [ + "VIA", + "COCO", + ], +) +def test_df_bboxes_from_multiple_files( + input_format: Literal["VIA", "COCO"], multiple_input_files: dict +): + """Test that the general bounding boxes loading function reads + correctly multiple files of the supported formats. 
+ """ + # Get format and list of files + list_files = multiple_input_files[input_format] + + # Get paths, annotations and images + list_paths = [file["path"] for file in list_files] + list_n_annotations = [file["n_annotations"] for file in list_files] + list_n_images = [file["n_images"] for file in list_files] + + # Read all files as a dataframe + df_all = _df_bboxes_from_multiple_files(list_paths, format=input_format) + + # Check dataframe + assert_dataframe( + df_all, + expected_n_annotations=sum(list_n_annotations), + expected_n_images=sum(list_n_images), + expected_supercategories="animal", + expected_categories="crab", + ) + + +@pytest.mark.parametrize( + "input_format, validator, row_function, no_error_expected", + [ + ("VIA", ValidVIA, _df_rows_from_valid_VIA_file, True), + ("COCO", ValidCOCO, _df_rows_from_valid_COCO_file, True), + ("unsupported", None, None, False), + ], +) +def test_df_bboxes_from_single_file( + input_format: Literal["VIA", "COCO"], + validator: type[ValidVIA] | type[ValidCOCO] | None, + row_function: Callable | None, + no_error_expected: bool, +): + """Test that the ``_df_bboxes_from_single_file`` function delegates + correctly into the specific format readers. + """ + file_path = Path("/mock/path/to/file") + function_to_mock = ( + "ethology.annotations.io._df_bboxes_from_single_specific_file" + ) + + # If the format is supported, check that when calling + # `_df_bboxes_from_single_file`, `_df_bboxes_from_single_specific_file` is + # called under the hood with the correct arguments + if no_error_expected: + with patch(function_to_mock) as mock: + _df_bboxes_from_single_file(file_path, input_format) + mock.assert_called_once_with( + file_path, + validator=validator, + get_rows_from_file=row_function, + ) + # If the format is not supported, check that an error is raised + else: + with pytest.raises(ValueError) as excinfo: + _df_bboxes_from_single_file(file_path, input_format) + assert "Unsupported format" in str(excinfo.value) + + +@pytest.mark.parametrize( + ( + "input_file, validator, row_function, " + "expected_n_annotations, expected_n_images" + ), + [ + ( + "VIA_JSON_sample_1.json", + ValidVIA, + _df_rows_from_valid_VIA_file, + 4440, + 50, + ), # medium VIA file + ( + "VIA_JSON_sample_2.json", + ValidVIA, + _df_rows_from_valid_VIA_file, + 3977, + 50, + ), # medium VIA file + ( + "small_bboxes_VIA.json", + ValidVIA, + _df_rows_from_valid_VIA_file, + 3, + 3, + ), # small VIA file + ( + "COCO_JSON_sample_1.json", + ValidCOCO, + _df_rows_from_valid_COCO_file, + 4344, + 100, + ), # medium COCO file + ( + "COCO_JSON_sample_2.json", + ValidCOCO, + _df_rows_from_valid_COCO_file, + 4618, + 100, + ), # medium COCO file + ( + "small_bboxes_COCO.json", + ValidCOCO, + _df_rows_from_valid_COCO_file, + 3, + 3, + ), # small COCO file + ], +) +def test_df_bboxes_from_single_specific_file( + input_file: str, + validator: type[ValidVIA] | type[ValidCOCO], + row_function: Callable, + expected_n_annotations: int, + expected_n_images: int, + annotations_test_data: dict, +): + """Test the specific bounding box format readers.""" + # Compute bboxes dataframe from a single file + df = _df_bboxes_from_single_specific_file( + file_path=annotations_test_data[input_file], + validator=validator, + get_rows_from_file=row_function, + ) + + # Check dataframe + # (we only check annotations per image in small datasets) + assert_dataframe( + df, + expected_n_annotations, + expected_n_images, + expected_supercategories="animal", + expected_categories="crab", + expected_annots_per_image=1 if 
expected_n_images < 5 else None, + ) + + +@pytest.mark.parametrize( + ("input_file, validator, row_function"), + [ + ( + "small_bboxes_duplicates_VIA.json", + ValidVIA, + _df_rows_from_valid_VIA_file, + ), + ( + "small_bboxes_duplicates_COCO.json", + ValidCOCO, + _df_rows_from_valid_COCO_file, + ), + ], +) +def test_df_bboxes_from_single_specific_file_duplicates( + input_file: str, + validator: type[ValidVIA] | type[ValidCOCO], + row_function: Callable, + annotations_test_data: dict, +): + """Test the specific bounding box format readers when the input file + contains duplicate annotations. + """ + # Properties of input data + # one annotation is duplicated in the first frame + expected_n_annotations_w_duplicates = 4 + expected_n_annotations_wo_duplicates = 3 + expected_n_images = 3 + + # Extract rows + rows = row_function(file_path=annotations_test_data[input_file]) + + # Check total number of annotations including duplicates + assert len(rows) == expected_n_annotations_w_duplicates + + # Compute bboxes dataframe + df = _df_bboxes_from_single_specific_file( + file_path=annotations_test_data[input_file], + validator=validator, + get_rows_from_file=row_function, + ) + + # Check dataframe has no duplicates + assert_dataframe( + df, + expected_n_annotations_wo_duplicates, + expected_n_images, + expected_supercategories="animal", + expected_categories="crab", + ) + + +@pytest.mark.parametrize( + ("input_file, validator, row_function, expected_exception"), + [ + ( + "small_bboxes_no_cat_VIA.json", + ValidVIA, + _df_rows_from_valid_VIA_file, + does_not_raise(), + ), + ( + "small_bboxes_no_cat_COCO.json", + ValidCOCO, + _df_rows_from_valid_COCO_file, + pytest.raises(KeyError), + ), + ], +) +def test_df_bboxes_from_single_specific_file_no_cat( + input_file: str, + validator: type[ValidVIA] | type[ValidCOCO], + row_function: Callable, + expected_exception: pytest.raises, + annotations_test_data: dict, +): + """Test the specific bounding box format readers when the input file + has annotations with no category. 
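+
+    VIA annotations without categories should load with empty strings
+    in the "category" and "supercategory" columns, whereas COCO
+    annotations without a category are expected to raise a ``KeyError``.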
+ """ + # Compute bboxes dataframe with input file that has no categories + # (this should raise an error for COCO files) + with expected_exception as excinfo: + df = _df_bboxes_from_single_specific_file( + file_path=annotations_test_data[input_file], + validator=validator, + get_rows_from_file=row_function, + ) + + # If no error expected, check that the dataframe has empty categories + if not excinfo: + assert all(df.loc[:, "category"] == "") + assert all(df.loc[:, "supercategory"] == "") + + +@pytest.mark.parametrize( + "input_file, expected_n_annotations", + [ + ("VIA_JSON_sample_1.json", 4440), + ("VIA_JSON_sample_2.json", 3977), + ("small_bboxes_VIA.json", 3), + ("small_bboxes_duplicates_VIA.json", 4), # contains duplicates + ], +) +def test_df_rows_from_valid_VIA_file( + input_file: str, + expected_n_annotations: int, + annotations_test_data: dict, +): + """Test the extraction of rows from a valid VIA file.""" + rows = _df_rows_from_valid_VIA_file( + file_path=annotations_test_data[input_file] + ) + + # Check number of rows + assert len(rows) == expected_n_annotations + + # Check each row contains required column data + # Note that "image_width" and "image_height" are not exported to the + # VIA file + for row in rows: + assert all( + key in row + for key in [STANDARD_BBOXES_DF_INDEX] + STANDARD_BBOXES_DF_COLUMNS + if key not in ["image_width", "image_height"] + ) + + +@pytest.mark.parametrize( + "input_file, expected_n_annotations", + [ + ("COCO_JSON_sample_1.json", 4344), + ("COCO_JSON_sample_2.json", 4618), + ("small_bboxes_COCO.json", 3), + ("small_bboxes_duplicates_COCO.json", 4), # contains duplicates + ], +) +def test_df_rows_from_valid_COCO_file( + input_file: str, + expected_n_annotations: int, + annotations_test_data: dict, +): + """Test the extraction of rows from a valid COCO file.""" + rows = _df_rows_from_valid_COCO_file( + file_path=annotations_test_data[input_file] + ) + + # Check number of rows + assert len(rows) == expected_n_annotations + + # Check each row contains required column data + for row in rows: + assert all( + key in row + for key in [STANDARD_BBOXES_DF_INDEX] + STANDARD_BBOXES_DF_COLUMNS + ) + + +@pytest.mark.parametrize( + "duplicates_kwargs, expected_exception", + [ + ({"ignore_index": True}, pytest.raises(ValueError)), + ({"inplace": True}, pytest.raises(ValueError)), + ({"subset": "image_id"}, does_not_raise()), + ({"keep": "last"}, does_not_raise()), + ], +) +@pytest.mark.parametrize( + "input_format, filename", + [ + ("VIA", "small_bboxes_duplicates_VIA.json"), + ("VIA", "MULTIPLE_VIA_FILES"), + ("COCO", "small_bboxes_duplicates_COCO.json"), + ("COCO", "MULTIPLE_COCO_FILES"), + ], +) +def test_df_bboxes_from_files_kwargs( + input_format: Literal["VIA", "COCO"], + filename: str | list[str], + duplicates_kwargs: dict, + expected_exception: pytest.raises, + annotations_test_data: dict, + multiple_input_files: dict, +): + # Check kwargs behaviour when passing multiple files + if "MULTIPLE" in filename: + list_files = multiple_input_files[input_format] + + input_files = [file["path"] for file in list_files] + list_n_annotations = [file["n_annotations"] for file in list_files] + list_n_images = [file["n_images"] for file in list_files] + + expected_n_annotations = sum(list_n_annotations) + expected_n_images = sum(list_n_images) + expected_annots_per_image = None + + # Check kwargs behaviour when passing a single file + else: + input_files = annotations_test_data[filename] + expected_n_annotations = 3 + expected_n_images = 3 + expected_annots_per_image = 
1
+
+    # Compute dataframe and check if an error is raised
+    with expected_exception as excinfo:
+        df = df_bboxes_from_files(
+            input_files,
+            format=input_format,
+            **duplicates_kwargs,
+        )
+
+    # If an error is raised: check the error message.
+    # Note that `excinfo` is None when no error is raised, so we branch
+    # on it directly; comparing `expected_exception == does_not_raise()`
+    # is always False because `nullcontext` instances compare by identity.
+    if excinfo:
+        assert (
+            "argument for `pandas.DataFrame.drop_duplicates` "
+            "may not be overridden." in str(excinfo.value)
+        )
+    # If no error is raised: check the dataframe content
+    else:
+        assert_dataframe(
+            df,
+            expected_n_annotations=expected_n_annotations,
+            expected_n_images=expected_n_images,
+            expected_supercategories="animal",
+            expected_categories="crab",
+            expected_annots_per_image=expected_annots_per_image,
+        )
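
For reference, a minimal usage sketch of the new public reader (the input
file names below are illustrative placeholders, not files added by this PR):

    from pathlib import Path

    from ethology.annotations.io import df_bboxes_from_files

    # Combine two VIA JSON files into a single standard bboxes dataframe,
    # keeping the last occurrence of any duplicated annotation
    df = df_bboxes_from_files(
        [Path("annotations_1.json"), Path("annotations_2.json")],
        format="VIA",
        keep="last",  # forwarded to pandas.DataFrame.drop_duplicates
    )

    # The result is indexed by "annotation_id" and has the standard
    # columns, e.g. "image_filename", "x_min", "y_min", "width", "height"
    print(df.head())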