From 6ac5a7b7532881dfedbd932a8e7cbdfc55cb8ee1 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Thu, 12 Dec 2024 17:46:54 +0100 Subject: [PATCH 01/14] Add even vertical partitioner --- .../flwr_datasets/partitioner/__init__.py | 2 + .../partitioner/vertical_even_partitioner.py | 226 ++++++++++++++++++ .../vertical_even_partitioner_test.py | 201 ++++++++++++++++ .../partitioner/vertical_partitioner_utils.py | 102 ++++++++ .../vertical_partitioner_utils_test.py | 144 +++++++++++ 5 files changed, 675 insertions(+) create mode 100644 datasets/flwr_datasets/partitioner/vertical_even_partitioner.py create mode 100644 datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py create mode 100644 datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py create mode 100644 datasets/flwr_datasets/partitioner/vertical_partitioner_utils_test.py diff --git a/datasets/flwr_datasets/partitioner/__init__.py b/datasets/flwr_datasets/partitioner/__init__.py index a14efa1cc905..59f647f44b16 100644 --- a/datasets/flwr_datasets/partitioner/__init__.py +++ b/datasets/flwr_datasets/partitioner/__init__.py @@ -29,6 +29,7 @@ from .shard_partitioner import ShardPartitioner from .size_partitioner import SizePartitioner from .square_partitioner import SquarePartitioner +from .vertical_even_partitioner import VerticalEvenPartitioner __all__ = [ "DirichletPartitioner", @@ -45,4 +46,5 @@ "ShardPartitioner", "SizePartitioner", "SquarePartitioner", + "VerticalEvenPartitioner", ] diff --git a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py new file mode 100644 index 000000000000..6a6df3df35a0 --- /dev/null +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py @@ -0,0 +1,226 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""VerticalEvenPartitioner class.""" +# flake8: noqa: E501 +from typing import Literal, Optional, Union + +import numpy as np + +import datasets +from flwr_datasets.partitioner.partitioner import Partitioner +from flwr_datasets.partitioner.vertical_partitioner_utils import ( + _add_active_party_columns, + _list_split, +) + + +class VerticalEvenPartitioner(Partitioner): + """Partitioner that splits features (columns) evenly into vertical partitions. + + Enables selection of "active party" column(s) and palcement into + a specific partition or creation of a new partition just for it. + Also enables droping columns and sharing specified columns across + all partitions. + + The number and nature of partitions can be defined in various ways: + - By specifying a simple integer for even splitting. + - By providing ratios or absolute counts for each partition. + - By explicitly listing the columns for each partition. + (see `column_distribution` and `mode` parameters for more details) + + Parameters + ---------- + num_partitions : int + Number of partitions to create. + active_party_columns : Optional[list[str]] + Columns associated with the "active party" (which can be the server). + active_party_columns_mode : Union[Literal[["add_to_first", "add_to_last", "create_as_first", "create_as_last", "add_to_all"], int] + Determines how to assign the active party columns: + - "add_to_first": Append active party columns to the first partition. 
+ - "add_to_last": Append active party columns to the last partition. + - int: Append active party columns to the specified partition index. + - "create_as_first": Create a new partition at the start containing only + these columns. + - "create_as_last": Create a new partition at the end containing only + these columns. + - "add_to_all": Append active party columns to all partitions. + drop_columns : Optional[list[str]] + Columns to remove entirely from the dataset before partitioning. + shared_columns : Optional[list[str]] + Columns to duplicate into every partition after initial partitioning. + shuffle : bool + Whether to shuffle the order of columns before partitioning. + seed : Optional[int] + Random seed for shuffling columns. Has no effect if `shuffle=False`. + + Examples + -------- + >>> partitioner = VerticalEvenPartitioner( + ... num_partitions=3, + ... active_party_columns=["income"], + ... active_party_columns_mode="add_to_last", + ... shuffle=True, + ... seed=42 + ... ) + >>> fds = FederatedDataset( + ... dataset="scikit-learn/adult-census-income", + ... partitioners={"train": partitioner} + ... 
) + >>> partitions = [fds.load_partition(i) for i in range(partitioner.num_partitions)] + >>> print([partition.column_names for partition in partitions]) + """ + + def __init__( + self, + num_partitions: int, + active_party_columns: Optional[list[str]] = None, + active_party_columns_mode: Union[ + Literal[ + "add_to_first", + "add_to_last", + "create_as_first", + "create_as_last", + "add_to_all", + ], + int, + ] = "add_to_last", + drop_columns: Optional[list[str]] = None, + shared_columns: Optional[list[str]] = None, + shuffle: bool = True, + seed: Optional[int] = 42, + ) -> None: + super().__init__() + + self._num_partitions = num_partitions + self._active_party_columns = active_party_columns or [] + self._active_party_columns_mode = active_party_columns_mode + self._drop_columns = drop_columns or [] + self._shared_columns = shared_columns or [] + self._shuffle = shuffle + self._seed = seed + self._rng = np.random.default_rng(seed=self._seed) + + self._partition_columns: Optional[list[list[str]]] = None + self._partitions_determined = False + + self._validate_parameters_in_init() + + def _determine_partitions_if_needed(self) -> None: + if self._partitions_determined: + return + + if self.dataset is None: + raise ValueError("No dataset is set for this partitioner.") + + all_columns = list(self.dataset.column_names) + self._validate_parameters_while_partitioning( + all_columns, self._shared_columns, self._active_party_columns + ) + columns = [column for column in all_columns if column not in self._drop_columns] + columns = [column for column in columns if column not in self._shared_columns] + columns = [ + column for column in columns if column not in self._active_party_columns + ] + + if self._shuffle: + self._rng.shuffle(columns) + partition_columns = _list_split(columns, self._num_partitions) + partition_columns = _add_active_party_columns( + self._active_party_columns, + self._active_party_columns_mode, + partition_columns, + ) + + # Add shared columns to all 
partitions + for partition in partition_columns: + for column in self._shared_columns: + partition.append(column) + + self._partition_columns = partition_columns + self._partitions_determined = True + + def load_partition(self, partition_id: int) -> datasets.Dataset: + """Load a partition based on the partition index. + + Parameters + ---------- + partition_id : int + The index that corresponds to the requested partition. + + Returns + ------- + dataset_partition : Dataset + Single partition of a dataset. + """ + self._determine_partitions_if_needed() + assert self._partition_columns is not None + if partition_id < 0 or partition_id >= len(self._partition_columns): + raise ValueError(f"Invalid partition_id {partition_id}.") + columns = self._partition_columns[partition_id] + return self.dataset.select_columns(columns) + + @property + def num_partitions(self) -> int: + """Number of partitions.""" + self._determine_partitions_if_needed() + assert self._partition_columns is not None + return len(self._partition_columns) + + def _validate_parameters_in_init(self) -> None: + if self._num_partitions < 1: + raise ValueError("column_distribution as int must be >= 1.") + + # Validate columns lists + for parameter_name, parameter_list in [ + ("drop_columns", self._drop_columns), + ("shared_columns", self._shared_columns), + ("active_party_columns", self._active_party_columns), + ]: + if not all(isinstance(column, str) for column in parameter_list): + raise ValueError(f"All entries in {parameter_name} must be strings.") + + valid_modes = { + "add_to_first", + "add_to_last", + "create_as_first", + "create_as_last", + "add_to_all", + } + if not ( + isinstance(self._active_party_columns_mode, int) + or self._active_party_columns_mode in valid_modes + ): + raise ValueError( + "active_party_columns_mode must be an int or one of " + "'add_to_first', 'add_to_last', 'create_as_first', 'create_as_last', " + "'add_to_all'." 
+ ) + + def _validate_parameters_while_partitioning( + self, + all_columns: list[str], + shared_columns: list[str], + active_party_columns: list[str], + ) -> None: + # Shared columns existance check + for column in shared_columns: + if column not in all_columns: + raise ValueError(f"Shared column '{column}' not found in the dataset.") + # Active party columns existence check + for column in active_party_columns: + if column not in all_columns: + raise ValueError( + f"Active party column '{column}' not found in the dataset." + ) diff --git a/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py new file mode 100644 index 000000000000..3b35208706c2 --- /dev/null +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py @@ -0,0 +1,201 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""VerticalEvenPartitioner class tests.""" +# mypy: disable-error-code=list-item,arg-type +import unittest + +import numpy as np + +from datasets import Dataset +from flwr_datasets.partitioner.vertical_even_partitioner import VerticalEvenPartitioner + + +def _create_dummy_dataset(column_names: list[str], num_rows: int = 100) -> Dataset: + """Create a dummy dataset with random data for testing.""" + data = {} + rng = np.random.default_rng(seed=42) + for col in column_names: + # Just numeric data; could also be strings, categoricals, etc. + data[col] = rng.integers(0, 100, size=num_rows).tolist() + return Dataset.from_dict(data) + + +class TestVerticalEvenPartitioner(unittest.TestCase): + """Unit tests for VerticalEvenPartitioner.""" + + def test_init_with_invalid_num_partitions(self) -> None: + """Test that initializing with an invalid number of partitions.""" + with self.assertRaises(ValueError): + VerticalEvenPartitioner(num_partitions=0) + + def test_init_with_invalid_active_party_mode(self) -> None: + """Test initialization with invalid active_party_columns_mode.""" + with self.assertRaises(ValueError): + VerticalEvenPartitioner( + num_partitions=2, active_party_columns_mode="invalid_mode" + ) + + def test_init_with_non_string_drop_columns(self) -> None: + """Test initialization with non-string elements in drop_columns.""" + with self.assertRaises(ValueError): + VerticalEvenPartitioner(num_partitions=2, drop_columns=[1, "a", 3]) + + def test_init_with_non_string_shared_columns(self) -> None: + """Test initialization with non-string elements in shared_columns.""" + with self.assertRaises(ValueError): + VerticalEvenPartitioner(num_partitions=2, shared_columns=["col1", 123]) + + def test_init_with_non_string_active_party_columns(self) -> None: + """Test initialization with non-string elements in active_party_columns.""" + with self.assertRaises(ValueError): + VerticalEvenPartitioner( 
+ num_partitions=2, active_party_columns=["col1", None] + ) + + def test_partitioning_basic(self) -> None: + """Test basic partitioning with no special columns or dropping.""" + columns = ["feature1", "feature2", "feature3", "feature4"] + dataset = _create_dummy_dataset(columns, num_rows=50) + partitioner = VerticalEvenPartitioner(num_partitions=2, shuffle=False) + partitioner.dataset = dataset + + self.assertEqual(partitioner.num_partitions, 2) + + p0 = partitioner.load_partition(0) + p1 = partitioner.load_partition(1) + + self.assertEqual(len(p0.column_names), 2) + self.assertEqual(len(p1.column_names), 2) + self.assertIn("feature1", p0.column_names) + self.assertIn("feature2", p0.column_names) + self.assertIn("feature3", p1.column_names) + self.assertIn("feature4", p1.column_names) + + def test_partitioning_with_drop_columns(self) -> None: + """Test partitioning while dropping some columns.""" + columns = ["feature1", "feature2", "drop_me", "feature3", "feature4"] + dataset = _create_dummy_dataset(columns, num_rows=50) + partitioner = VerticalEvenPartitioner( + num_partitions=2, drop_columns=["drop_me"], shuffle=False, seed=42 + ) + partitioner.dataset = dataset + + p0 = partitioner.load_partition(0) + p1 = partitioner.load_partition(1) + all_partition_columns = p0.column_names + p1.column_names + + # The drop_me should not be in any partition + self.assertNotIn("drop_me", all_partition_columns) + # The rest of columns should be distributed + self.assertIn("feature1", all_partition_columns) + self.assertIn("feature2", all_partition_columns) + self.assertIn("feature3", all_partition_columns) + self.assertIn("feature4", all_partition_columns) + + def test_partitioning_with_shared_columns(self) -> None: + """Test that shared columns are present in all partitions.""" + columns = ["f1", "f2", "f3", "f4", "shared_col"] + dataset = _create_dummy_dataset(columns, num_rows=50) + partitioner = VerticalEvenPartitioner( + num_partitions=2, shared_columns=["shared_col"], 
shuffle=False, seed=42 + ) + partitioner.dataset = dataset + + p0 = partitioner.load_partition(0) + p1 = partitioner.load_partition(1) + + self.assertIn("shared_col", p0.column_names) + self.assertIn("shared_col", p1.column_names) + + def test_partitioning_with_active_party_columns_add_to_last(self) -> None: + """Test active party columns are appended to the last partition.""" + columns = ["f1", "f2", "f3", "f4", "income"] + dataset = _create_dummy_dataset(columns, num_rows=50) + partitioner = VerticalEvenPartitioner( + num_partitions=2, + active_party_columns=["income"], + active_party_columns_mode="add_to_last", + shuffle=False, + seed=42, + ) + partitioner.dataset = dataset + + p0 = partitioner.load_partition(0) + p1 = partitioner.load_partition(1) + + # The income should be only in the last partition + self.assertNotIn("income", p0.column_names) + self.assertIn("income", p1.column_names) + + def test_partitioning_with_active_party_columns_create_as_first(self) -> None: + """Test creating a new partition solely for active party columns.""" + columns = ["f1", "f2", "f3", "f4", "income"] + dataset = _create_dummy_dataset(columns, num_rows=50) + partitioner = VerticalEvenPartitioner( + num_partitions=2, + active_party_columns=["income"], + active_party_columns_mode="create_as_first", + shuffle=False, + ) + partitioner.dataset = dataset + + # The first partition should be just the active party columns + # and then two more partitions from original splitting. 
+ self.assertEqual(partitioner.num_partitions, 3) + + p0 = partitioner.load_partition(0) # active party partition + p1 = partitioner.load_partition(1) + p2 = partitioner.load_partition(2) + + self.assertEqual(p0.column_names, ["income"]) + self.assertIn("f1", p1.column_names) + self.assertIn("f2", p1.column_names) + self.assertIn("f3", p2.column_names) + self.assertIn("f4", p2.column_names) + + def test_partitioning_with_nonexistent_active_party_columns(self) -> None: + """Test that a ValueError is raised if active party column does not exist.""" + columns = ["f1", "f2", "f3", "f4"] + dataset = _create_dummy_dataset(columns, num_rows=50) + partitioner = VerticalEvenPartitioner( + num_partitions=2, + active_party_columns=["income"], # Not present in dataset + active_party_columns_mode="add_to_last", + shuffle=False, + ) + partitioner.dataset = dataset + + with self.assertRaises(ValueError) as context: + partitioner.load_partition(0) + self.assertIn("Active party column 'income' not found", str(context.exception)) + + def test_partitioning_with_nonexistent_shared_columns(self) -> None: + """Test that a ValueError is raised if shared column does not exist.""" + columns = ["f1", "f2", "f3"] + dataset = _create_dummy_dataset(columns, num_rows=50) + partitioner = VerticalEvenPartitioner( + num_partitions=2, shared_columns=["nonexistent_col"], shuffle=False + ) + partitioner.dataset = dataset + + with self.assertRaises(ValueError) as context: + partitioner.load_partition(0) + self.assertIn( + "Shared column 'nonexistent_col' not found", str(context.exception) + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py b/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py new file mode 100644 index 000000000000..8859bec6c675 --- /dev/null +++ b/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py @@ -0,0 +1,102 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""VerticalPartitioner utils.py.""" +# flake8: noqa: E501 +from typing import Any, Literal, Union + + +def _list_split(lst: list[Any], num_sublists: int) -> list[list[Any]]: + """Split a list into n nearly equal-sized sublists. + + Parameters + ---------- + lst : list[Any] + The list to split. + num_sublists : int + Number of sublists to create. + + Returns + ------- + subslist: list[list[Any]] + A list containing num_sublists sublists. + """ + if num_sublists <= 0: + raise ValueError("Number of splits must be greater than 0") + chunk_size, remainder = divmod(len(lst), num_sublists) + sublists = [] + start_index = 0 + for i in range(num_sublists): + end_index = start_index + chunk_size + if i < remainder: + end_index += 1 + sublists.append(lst[start_index:end_index]) + start_index = end_index + return sublists + + +def _add_active_party_columns( + active_party_columns: list[str], + active_party_columns_mode: Union[ + Literal[ + "add_to_first", + "add_to_last", + "create_as_first", + "create_as_last", + "add_to_all", + ], + int, + ], + partition_columns: list[list[str]], +) -> list[list[str]]: + """Add active party columns to the partition columns based on the mode. + + Parameters + ---------- + active_party_columns : list[str] + List of active party columns. 
+ active_party_columns_mode : Union[Literal["add_to_first", "add_to_last", "create_as_first", "create_as_last", "add_to_all"], int] + Mode to add active party columns to partition columns. + + Returns + ------- + partition_columns: list[list[str]] + List of partition columns after the modyfication. + """ + if isinstance(active_party_columns_mode, int): + partition_id = active_party_columns_mode + if partition_id < 0 or partition_id >= len(partition_columns): + raise ValueError( + f"Invalid partition index {partition_id} for active_party_columns_mode." + f"Must be in the range [0, {len(partition_columns) - 1}]" + f"but given {partition_id}" + ) + for column in active_party_columns: + partition_columns[partition_id].append(column) + else: + if active_party_columns_mode == "add_to_first": + for column in active_party_columns: + partition_columns[0].append(column) + elif active_party_columns_mode == "add_to_last": + for column in active_party_columns: + partition_columns[-1].append(column) + elif active_party_columns_mode == "create_as_first": + partition_columns.insert(0, active_party_columns) + elif active_party_columns_mode == "create_as_last": + partition_columns.append(active_party_columns) + elif active_party_columns_mode == "add_to_all": + for column in active_party_columns: + for partition in partition_columns: + partition.append(column) + return partition_columns diff --git a/datasets/flwr_datasets/partitioner/vertical_partitioner_utils_test.py b/datasets/flwr_datasets/partitioner/vertical_partitioner_utils_test.py new file mode 100644 index 000000000000..f85d027fe444 --- /dev/null +++ b/datasets/flwr_datasets/partitioner/vertical_partitioner_utils_test.py @@ -0,0 +1,144 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for vertical partitioner utilities.""" +import unittest +from typing import Any, Literal + +from flwr_datasets.partitioner.vertical_partitioner_utils import ( + _add_active_party_columns, + _list_split, +) + + +class TestVerticalPartitionerUtils(unittest.TestCase): + """Tests for _list_split and _add_active_party_columns utilities.""" + + def test_list_split_basic_splitting(self) -> None: + """Check equal splitting with divisible lengths.""" + lst = [1, 2, 3, 4, 5, 6] + result = _list_split(lst, 3) + expected = [[1, 2], [3, 4], [5, 6]] + self.assertEqual(result, expected) + + def test_list_split_uneven_splitting(self) -> None: + """Check uneven splitting with non-divisible lengths.""" + lst = [10, 20, 30, 40, 50] + result = _list_split(lst, 2) + expected = [[10, 20, 30], [40, 50]] + self.assertEqual(result, expected) + + def test_list_split_single_sublist(self) -> None: + """Check that single sublist returns the full list.""" + lst = [1, 2, 3] + result = _list_split(lst, 1) + expected = [[1, 2, 3]] + self.assertEqual(result, expected) + + def test_list_split_more_sublists_than_elements(self) -> None: + """Check extra sublists are empty when count exceeds length.""" + lst = [42] + result = _list_split(lst, 3) + expected = [[42], [], []] + self.assertEqual(result, expected) + + def test_list_split_empty_list(self) -> None: + """Check splitting empty list produces empty sublists.""" + lst: list[Any] = [] + result = _list_split(lst, 3) + expected: list[list[Any]] = [[], 
[], []] + self.assertEqual(result, expected) + + def test_list_split_invalid_num_sublists(self) -> None: + """Check ValueError when sublist count is zero or negative.""" + lst = [1, 2, 3] + with self.assertRaises(ValueError): + _list_split(lst, 0) + + def test_add_to_first(self) -> None: + """Check adding active cols to the first partition.""" + partition_columns = [["col1", "col2"], ["col3"], ["col4"]] + active_party_columns = ["active1", "active2"] + mode: Literal["add_to_first"] = "add_to_first" + result = _add_active_party_columns( + active_party_columns, mode, partition_columns + ) + self.assertEqual( + result, [["col1", "col2", "active1", "active2"], ["col3"], ["col4"]] + ) + + def test_add_to_last(self) -> None: + """Check adding active cols to the last partition.""" + partition_columns = [["col1", "col2"], ["col3"], ["col4"]] + active_party_columns = ["active"] + mode: Literal["add_to_last"] = "add_to_last" + result = _add_active_party_columns( + active_party_columns, mode, partition_columns + ) + self.assertEqual(result, [["col1", "col2"], ["col3"], ["col4", "active"]]) + + def test_create_as_first(self) -> None: + """Check creating a new first partition for active cols.""" + partition_columns = [["col1"], ["col2"]] + active_party_columns = ["active1", "active2"] + mode: Literal["create_as_first"] = "create_as_first" + result = _add_active_party_columns( + active_party_columns, mode, partition_columns + ) + self.assertEqual(result, [["active1", "active2"], ["col1"], ["col2"]]) + + def test_create_as_last(self) -> None: + """Check creating a new last partition for active cols.""" + partition_columns = [["col1"], ["col2"]] + active_party_columns = ["active1", "active2"] + mode: Literal["create_as_last"] = "create_as_last" + result = _add_active_party_columns( + active_party_columns, mode, partition_columns + ) + self.assertEqual(result, [["col1"], ["col2"], ["active1", "active2"]]) + + def test_add_to_all(self) -> None: + """Check adding active cols to all 
partitions.""" + partition_columns = [["col1"], ["col2", "col3"], ["col4"]] + active_party_columns = ["active"] + mode: Literal["add_to_all"] = "add_to_all" + result = _add_active_party_columns( + active_party_columns, mode, partition_columns + ) + self.assertEqual( + result, [["col1", "active"], ["col2", "col3", "active"], ["col4", "active"]] + ) + + def test_add_to_specific_partition_valid_index(self) -> None: + """Check adding active cols to a specific valid partition.""" + partition_columns = [["col1"], ["col2"], ["col3"]] + active_party_columns = ["active1", "active2"] + mode: int = 1 + result = _add_active_party_columns( + active_party_columns, mode, partition_columns + ) + self.assertEqual(result, [["col1"], ["col2", "active1", "active2"], ["col3"]]) + + def test_add_to_specific_partition_invalid_index(self) -> None: + """Check ValueError when partition index is invalid.""" + partition_columns = [["col1"], ["col2"]] + active_party_columns = ["active"] + mode: int = 5 + with self.assertRaises(ValueError) as context: + _add_active_party_columns(active_party_columns, mode, partition_columns) + self.assertIn("Invalid partition index", str(context.exception)) + + +if __name__ == "__main__": + unittest.main() From 7b56c1fe191d05282617ecd3b174658330390582 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Fri, 13 Dec 2024 09:34:13 +0100 Subject: [PATCH 02/14] Fix formatting errors --- .../flwr_datasets/partitioner/vertical_even_partitioner.py | 1 + .../partitioner/vertical_even_partitioner_test.py | 5 +++-- .../flwr_datasets/partitioner/vertical_partitioner_utils.py | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py index 6a6df3df35a0..180c4bd07347 100644 --- a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py @@ -14,6 +14,7 @@ # 
============================================================================== """VerticalEvenPartitioner class.""" # flake8: noqa: E501 +# pylint: disable=C0301, R0902, R0913 from typing import Literal, Optional, Union import numpy as np diff --git a/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py index 3b35208706c2..8e766617d609 100644 --- a/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================== """VerticalEvenPartitioner class tests.""" -# mypy: disable-error-code=list-item,arg-type +# mypy: disable-error-code=list-item import unittest import numpy as np @@ -44,7 +44,8 @@ def test_init_with_invalid_active_party_mode(self) -> None: """Test initialization with invalid active_party_columns_mode.""" with self.assertRaises(ValueError): VerticalEvenPartitioner( - num_partitions=2, active_party_columns_mode="invalid_mode" + num_partitions=2, + active_party_columns_mode="invalid_mode", # type: ignore[arg-type] ) def test_init_with_non_string_drop_columns(self) -> None: diff --git a/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py b/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py index 8859bec6c675..e9e7e3855ef4 100644 --- a/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py +++ b/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py @@ -14,6 +14,7 @@ # ============================================================================== """VerticalPartitioner utils.py.""" # flake8: noqa: E501 +# pylint: disable=C0301 from typing import Any, Literal, Union From 31e10203253e46643f1aa7f1f774d684dbad609a Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Fri, 13 Dec 2024 11:47:42 +0100 Subject: [PATCH 03/14] Remove 
outdated docstring --- .../flwr_datasets/partitioner/vertical_even_partitioner.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py index 180c4bd07347..54d70b7c8389 100644 --- a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py @@ -35,12 +35,6 @@ class VerticalEvenPartitioner(Partitioner): Also enables droping columns and sharing specified columns across all partitions. - The number and nature of partitions can be defined in various ways: - - By specifying a simple integer for even splitting. - - By providing ratios or absolute counts for each partition. - - By explicitly listing the columns for each partition. - (see `column_distribution` and `mode` parameters for more details) - Parameters ---------- num_partitions : int From 8e143183896be4bc9cf2d9ca405e407eb831160c Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Fri, 13 Dec 2024 12:30:40 +0100 Subject: [PATCH 04/14] Update naming convention for active_party_column --- .../partitioner/vertical_even_partitioner.py | 39 ++++++++++--------- .../vertical_even_partitioner_test.py | 28 ++++++------- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py index 54d70b7c8389..3a382d8e2a2e 100644 --- a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py @@ -39,9 +39,10 @@ class VerticalEvenPartitioner(Partitioner): ---------- num_partitions : int Number of partitions to create. - active_party_columns : Optional[list[str]] - Columns associated with the "active party" (which can be the server). 
- active_party_columns_mode : Union[Literal[["add_to_first", "add_to_last", "create_as_first", "create_as_last", "add_to_all"], int] + active_party_column : Optional[Union[str, list[str]]] + Column(s) (typically representing labels) associated with the + "active party" (which can be the server). + active_party_column_mode : Union[Literal[["add_to_first", "add_to_last", "create_as_first", "create_as_last", "add_to_all"], int] Determines how to assign the active party columns: - "add_to_first": Append active party columns to the first partition. - "add_to_last": Append active party columns to the last partition. @@ -64,8 +65,8 @@ class VerticalEvenPartitioner(Partitioner): -------- >>> partitioner = VerticalEvenPartitioner( ... num_partitions=3, - ... active_party_columns=["income"], - ... active_party_columns_mode="add_to_last", + ... active_party_column=["income"], + ... active_party_column_mode="add_to_last", ... shuffle=True, ... seed=42 ... ) @@ -80,8 +81,8 @@ class VerticalEvenPartitioner(Partitioner): def __init__( self, num_partitions: int, - active_party_columns: Optional[list[str]] = None, - active_party_columns_mode: Union[ + active_party_column: Optional[list[str]] = None, + active_party_column_mode: Union[ Literal[ "add_to_first", "add_to_last", @@ -99,8 +100,8 @@ def __init__( super().__init__() self._num_partitions = num_partitions - self._active_party_columns = active_party_columns or [] - self._active_party_columns_mode = active_party_columns_mode + self._active_party_column = active_party_column or [] + self._active_party_column_mode = active_party_column_mode self._drop_columns = drop_columns or [] self._shared_columns = shared_columns or [] self._shuffle = shuffle @@ -121,20 +122,20 @@ def _determine_partitions_if_needed(self) -> None: all_columns = list(self.dataset.column_names) self._validate_parameters_while_partitioning( - all_columns, self._shared_columns, self._active_party_columns + all_columns, self._shared_columns, 
self._active_party_column ) columns = [column for column in all_columns if column not in self._drop_columns] columns = [column for column in columns if column not in self._shared_columns] columns = [ - column for column in columns if column not in self._active_party_columns + column for column in columns if column not in self._active_party_column ] if self._shuffle: self._rng.shuffle(columns) partition_columns = _list_split(columns, self._num_partitions) partition_columns = _add_active_party_columns( - self._active_party_columns, - self._active_party_columns_mode, + self._active_party_column, + self._active_party_column_mode, partition_columns, ) @@ -181,7 +182,7 @@ def _validate_parameters_in_init(self) -> None: for parameter_name, parameter_list in [ ("drop_columns", self._drop_columns), ("shared_columns", self._shared_columns), - ("active_party_columns", self._active_party_columns), + ("active_party_column", self._active_party_column), ]: if not all(isinstance(column, str) for column in parameter_list): raise ValueError(f"All entries in {parameter_name} must be strings.") @@ -194,11 +195,11 @@ def _validate_parameters_in_init(self) -> None: "add_to_all", } if not ( - isinstance(self._active_party_columns_mode, int) - or self._active_party_columns_mode in valid_modes + isinstance(self._active_party_column_mode, int) + or self._active_party_column_mode in valid_modes ): raise ValueError( - "active_party_columns_mode must be an int or one of " + "active_party_column_mode must be an int or one of " "'add_to_first', 'add_to_last', 'create_as_first', 'create_as_last', " "'add_to_all'." 
) @@ -207,14 +208,14 @@ def _validate_parameters_while_partitioning( self, all_columns: list[str], shared_columns: list[str], - active_party_columns: list[str], + active_party_column: list[str], ) -> None: # Shared columns existance check for column in shared_columns: if column not in all_columns: raise ValueError(f"Shared column '{column}' not found in the dataset.") # Active party columns existence check - for column in active_party_columns: + for column in active_party_column: if column not in all_columns: raise ValueError( f"Active party column '{column}' not found in the dataset." diff --git a/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py index 8e766617d609..b561fa11ce06 100644 --- a/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py @@ -41,11 +41,11 @@ def test_init_with_invalid_num_partitions(self) -> None: VerticalEvenPartitioner(num_partitions=0) def test_init_with_invalid_active_party_mode(self) -> None: - """Test initialization with invalid active_party_columns_mode.""" + """Test initialization with invalid active_party_column_mode.""" with self.assertRaises(ValueError): VerticalEvenPartitioner( num_partitions=2, - active_party_columns_mode="invalid_mode", # type: ignore[arg-type] + active_party_column_mode="invalid_mode", # type: ignore[arg-type] ) def test_init_with_non_string_drop_columns(self) -> None: @@ -58,11 +58,11 @@ def test_init_with_non_string_shared_columns(self) -> None: with self.assertRaises(ValueError): VerticalEvenPartitioner(num_partitions=2, shared_columns=["col1", 123]) - def test_init_with_non_string_active_party_columns(self) -> None: - """Test initialization with non-string elements in active_party_columns.""" + def test_init_with_non_string_active_party_column(self) -> None: + """Test initialization with non-string elements in 
active_party_column.""" with self.assertRaises(ValueError): VerticalEvenPartitioner( - num_partitions=2, active_party_columns=["col1", None] + num_partitions=2, active_party_column=["col1", None] ) def test_partitioning_basic(self) -> None: @@ -120,14 +120,14 @@ def test_partitioning_with_shared_columns(self) -> None: self.assertIn("shared_col", p0.column_names) self.assertIn("shared_col", p1.column_names) - def test_partitioning_with_active_party_columns_add_to_last(self) -> None: + def test_partitioning_with_active_party_column_add_to_last(self) -> None: """Test active party columns are appended to the last partition.""" columns = ["f1", "f2", "f3", "f4", "income"] dataset = _create_dummy_dataset(columns, num_rows=50) partitioner = VerticalEvenPartitioner( num_partitions=2, - active_party_columns=["income"], - active_party_columns_mode="add_to_last", + active_party_column=["income"], + active_party_column_mode="add_to_last", shuffle=False, seed=42, ) @@ -140,14 +140,14 @@ def test_partitioning_with_active_party_columns_add_to_last(self) -> None: self.assertNotIn("income", p0.column_names) self.assertIn("income", p1.column_names) - def test_partitioning_with_active_party_columns_create_as_first(self) -> None: + def test_partitioning_with_active_party_column_create_as_first(self) -> None: """Test creating a new partition solely for active party columns.""" columns = ["f1", "f2", "f3", "f4", "income"] dataset = _create_dummy_dataset(columns, num_rows=50) partitioner = VerticalEvenPartitioner( num_partitions=2, - active_party_columns=["income"], - active_party_columns_mode="create_as_first", + active_party_column=["income"], + active_party_column_mode="create_as_first", shuffle=False, ) partitioner.dataset = dataset @@ -166,14 +166,14 @@ def test_partitioning_with_active_party_columns_create_as_first(self) -> None: self.assertIn("f3", p2.column_names) self.assertIn("f4", p2.column_names) - def test_partitioning_with_nonexistent_active_party_columns(self) -> None: + 
def test_partitioning_with_nonexistent_active_party_column(self) -> None: """Test that a ValueError is raised if active party column does not exist.""" columns = ["f1", "f2", "f3", "f4"] dataset = _create_dummy_dataset(columns, num_rows=50) partitioner = VerticalEvenPartitioner( num_partitions=2, - active_party_columns=["income"], # Not present in dataset - active_party_columns_mode="add_to_last", + active_party_column=["income"], # Not present in dataset + active_party_column_mode="add_to_last", shuffle=False, ) partitioner.dataset = dataset From a73f623873c225f39f1db5d6b8aea35d430fe573 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Fri, 13 Dec 2024 12:31:59 +0100 Subject: [PATCH 05/14] Update naming convention for active_party_column --- .../flwr_datasets/partitioner/vertical_even_partitioner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py index 3a382d8e2a2e..7088a65eec1c 100644 --- a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py @@ -65,7 +65,7 @@ class VerticalEvenPartitioner(Partitioner): -------- >>> partitioner = VerticalEvenPartitioner( ... num_partitions=3, - ... active_party_column=["income"], + ... active_party_column="income", ... active_party_column_mode="add_to_last", ... shuffle=True, ... 
seed=42 @@ -81,7 +81,7 @@ class VerticalEvenPartitioner(Partitioner): def __init__( self, num_partitions: int, - active_party_column: Optional[list[str]] = None, + active_party_column: Optional[Union[str, list[str]]] = None, active_party_column_mode: Union[ Literal[ "add_to_first", From 7418767e06f38c3b30393c55e6adc5fce9f8fe83 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Thu, 19 Dec 2024 13:44:12 +0100 Subject: [PATCH 06/14] Make naming and docstrings consistent --- .../partitioner/vertical_even_partitioner.py | 37 +++++++++---------- .../vertical_even_partitioner_test.py | 22 +++++------ 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py index 7088a65eec1c..e5960e7a10fd 100644 --- a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py @@ -42,16 +42,15 @@ class VerticalEvenPartitioner(Partitioner): active_party_column : Optional[Union[str, list[str]]] Column(s) (typically representing labels) associated with the "active party" (which can be the server). - active_party_column_mode : Union[Literal[["add_to_first", "add_to_last", "create_as_first", "create_as_last", "add_to_all"], int] + active_party_columns_mode : Union[Literal[["add_to_first", "add_to_last", "create_as_first", "create_as_last", "add_to_all"], int] Determines how to assign the active party columns: - - "add_to_first": Append active party columns to the first partition. - - "add_to_last": Append active party columns to the last partition. + + - `"add_to_first"`: Append active party columns to the first partition. + - `"add_to_last"`: Append active party columns to the last partition. + - `"create_as_first"`: Create a new partition at the start containing only these columns. + - `"create_as_last"`: Create a new partition at the end containing only these columns. 
+ - `"add_to_all"`: Append active party columns to all partitions. - int: Append active party columns to the specified partition index. - - "create_as_first": Create a new partition at the start containing only - these columns. - - "create_as_last": Create a new partition at the end containing only - these columns. - - "add_to_all": Append active party columns to all partitions. drop_columns : Optional[list[str]] Columns to remove entirely from the dataset before partitioning. shared_columns : Optional[list[str]] @@ -81,8 +80,8 @@ class VerticalEvenPartitioner(Partitioner): def __init__( self, num_partitions: int, - active_party_column: Optional[Union[str, list[str]]] = None, - active_party_column_mode: Union[ + active_party_columns: Optional[Union[str, list[str]]] = None, + active_party_columns_mode: Union[ Literal[ "add_to_first", "add_to_last", @@ -100,8 +99,8 @@ def __init__( super().__init__() self._num_partitions = num_partitions - self._active_party_column = active_party_column or [] - self._active_party_column_mode = active_party_column_mode + self._active_party_columns = active_party_columns or [] + self._active_party_columns_mode = active_party_columns_mode self._drop_columns = drop_columns or [] self._shared_columns = shared_columns or [] self._shuffle = shuffle @@ -122,20 +121,20 @@ def _determine_partitions_if_needed(self) -> None: all_columns = list(self.dataset.column_names) self._validate_parameters_while_partitioning( - all_columns, self._shared_columns, self._active_party_column + all_columns, self._shared_columns, self._active_party_columns ) columns = [column for column in all_columns if column not in self._drop_columns] columns = [column for column in columns if column not in self._shared_columns] columns = [ - column for column in columns if column not in self._active_party_column + column for column in columns if column not in self._active_party_columns ] if self._shuffle: self._rng.shuffle(columns) partition_columns = _list_split(columns, 
self._num_partitions) partition_columns = _add_active_party_columns( - self._active_party_column, - self._active_party_column_mode, + self._active_party_columns, + self._active_party_columns_mode, partition_columns, ) @@ -182,7 +181,7 @@ def _validate_parameters_in_init(self) -> None: for parameter_name, parameter_list in [ ("drop_columns", self._drop_columns), ("shared_columns", self._shared_columns), - ("active_party_column", self._active_party_column), + ("active_party_column", self._active_party_columns), ]: if not all(isinstance(column, str) for column in parameter_list): raise ValueError(f"All entries in {parameter_name} must be strings.") @@ -195,8 +194,8 @@ def _validate_parameters_in_init(self) -> None: "add_to_all", } if not ( - isinstance(self._active_party_column_mode, int) - or self._active_party_column_mode in valid_modes + isinstance(self._active_party_columns_mode, int) + or self._active_party_columns_mode in valid_modes ): raise ValueError( "active_party_column_mode must be an int or one of " diff --git a/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py index b561fa11ce06..dc37bde48ee0 100644 --- a/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py @@ -41,11 +41,11 @@ def test_init_with_invalid_num_partitions(self) -> None: VerticalEvenPartitioner(num_partitions=0) def test_init_with_invalid_active_party_mode(self) -> None: - """Test initialization with invalid active_party_column_mode.""" + """Test initialization with invalid active_party_columns_mode.""" with self.assertRaises(ValueError): VerticalEvenPartitioner( num_partitions=2, - active_party_column_mode="invalid_mode", # type: ignore[arg-type] + active_party_columns_mode="invalid_mode", # type: ignore[arg-type] ) def test_init_with_non_string_drop_columns(self) -> None: @@ -62,7 +62,7 @@ def 
test_init_with_non_string_active_party_column(self) -> None: """Test initialization with non-string elements in active_party_column.""" with self.assertRaises(ValueError): VerticalEvenPartitioner( - num_partitions=2, active_party_column=["col1", None] + num_partitions=2, active_party_columns=["col1", None] ) def test_partitioning_basic(self) -> None: @@ -120,14 +120,14 @@ def test_partitioning_with_shared_columns(self) -> None: self.assertIn("shared_col", p0.column_names) self.assertIn("shared_col", p1.column_names) - def test_partitioning_with_active_party_column_add_to_last(self) -> None: + def test_partitioning_with_active_party_columns_add_to_last(self) -> None: """Test active party columns are appended to the last partition.""" columns = ["f1", "f2", "f3", "f4", "income"] dataset = _create_dummy_dataset(columns, num_rows=50) partitioner = VerticalEvenPartitioner( num_partitions=2, - active_party_column=["income"], - active_party_column_mode="add_to_last", + active_party_columns=["income"], + active_party_columns_mode="add_to_last", shuffle=False, seed=42, ) @@ -140,14 +140,14 @@ def test_partitioning_with_active_party_column_add_to_last(self) -> None: self.assertNotIn("income", p0.column_names) self.assertIn("income", p1.column_names) - def test_partitioning_with_active_party_column_create_as_first(self) -> None: + def test_partitioning_with_active_party_columns_create_as_first(self) -> None: """Test creating a new partition solely for active party columns.""" columns = ["f1", "f2", "f3", "f4", "income"] dataset = _create_dummy_dataset(columns, num_rows=50) partitioner = VerticalEvenPartitioner( num_partitions=2, - active_party_column=["income"], - active_party_column_mode="create_as_first", + active_party_columns=["income"], + active_party_columns_mode="create_as_first", shuffle=False, ) partitioner.dataset = dataset @@ -172,8 +172,8 @@ def test_partitioning_with_nonexistent_active_party_column(self) -> None: dataset = _create_dummy_dataset(columns, 
num_rows=50) partitioner = VerticalEvenPartitioner( num_partitions=2, - active_party_column=["income"], # Not present in dataset - active_party_column_mode="add_to_last", + active_party_columns=["income"], # Not present in dataset + active_party_columns_mode="add_to_last", shuffle=False, ) partitioner.dataset = dataset From c506786469ca511cdd014339e3a69f9992fb9ba0 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Thu, 19 Dec 2024 14:22:12 +0100 Subject: [PATCH 07/14] Fix the types of auxiliary functions --- .../flwr_datasets/partitioner/vertical_even_partitioner.py | 4 +++- .../flwr_datasets/partitioner/vertical_partitioner_utils.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py index e5960e7a10fd..1afdb7c04271 100644 --- a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py @@ -207,8 +207,10 @@ def _validate_parameters_while_partitioning( self, all_columns: list[str], shared_columns: list[str], - active_party_column: list[str], + active_party_column: Union[str, list[str]], ) -> None: + if isinstance(active_party_column, str): + active_party_column = [active_party_column] # Shared columns existance check for column in shared_columns: if column not in all_columns: diff --git a/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py b/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py index e9e7e3855ef4..097cde81e576 100644 --- a/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py +++ b/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py @@ -48,7 +48,7 @@ def _list_split(lst: list[Any], num_sublists: int) -> list[list[Any]]: def _add_active_party_columns( - active_party_columns: list[str], + active_party_columns: Union[str, list[str]], active_party_columns_mode: Union[ Literal[ 
"add_to_first", @@ -65,7 +65,7 @@ def _add_active_party_columns( Parameters ---------- - active_party_columns : list[str] + active_party_columns : Union[str, list[str]] List of active party columns. active_party_columns_mode : Union[Literal["add_to_first", "add_to_last", "create_as_first", "create_as_last", "add_to_all"], int] Mode to add active party columns to partition columns. @@ -75,6 +75,8 @@ def _add_active_party_columns( partition_columns: list[list[str]] List of partition columns after the modyfication. """ + if isinstance(active_party_columns, str): + active_party_columns = [active_party_columns] if isinstance(active_party_columns_mode, int): partition_id = active_party_columns_mode if partition_id < 0 or partition_id >= len(partition_columns): From 3fdae3fe1a45dfc337d1b7cae7676bdd1eba9809 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Thu, 19 Dec 2024 14:30:32 +0100 Subject: [PATCH 08/14] Fix tests --- .../flwr_datasets/partitioner/vertical_partitioner_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py b/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py index 097cde81e576..d232f0c80207 100644 --- a/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py +++ b/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py @@ -47,7 +47,7 @@ def _list_split(lst: list[Any], num_sublists: int) -> list[list[Any]]: return sublists -def _add_active_party_columns( +def _add_active_party_columns( # pylint: disable=R0912 active_party_columns: Union[str, list[str]], active_party_columns_mode: Union[ Literal[ From 333ae86617bd2a596986d68a2b42e1840a6a8bf8 Mon Sep 17 00:00:00 2001 From: Javier Date: Thu, 19 Dec 2024 14:32:31 +0000 Subject: [PATCH 09/14] Apply suggestions from code review --- .../partitioner/vertical_even_partitioner.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py index 1afdb7c04271..d27e753e6b57 100644 --- a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py @@ -30,7 +30,7 @@ class VerticalEvenPartitioner(Partitioner): """Partitioner that splits features (columns) evenly into vertical partitions. - Enables selection of "active party" column(s) and palcement into + Enables selection of "active party" column(s) and placement into a specific partition or creation of a new partition just for it. Also enables droping columns and sharing specified columns across all partitions. @@ -175,7 +175,7 @@ def num_partitions(self) -> int: def _validate_parameters_in_init(self) -> None: if self._num_partitions < 1: - raise ValueError("column_distribution as int must be >= 1.") + raise ValueError("`column_distribution` as int must be >= 1.") # Validate columns lists for parameter_name, parameter_list in [ @@ -184,7 +184,7 @@ def _validate_parameters_in_init(self) -> None: ("active_party_column", self._active_party_columns), ]: if not all(isinstance(column, str) for column in parameter_list): - raise ValueError(f"All entries in {parameter_name} must be strings.") + raise ValueError(f"All entries in '{parameter_name}' must be strings.") valid_modes = { "add_to_first", @@ -198,7 +198,7 @@ def _validate_parameters_in_init(self) -> None: or self._active_party_columns_mode in valid_modes ): raise ValueError( - "active_party_column_mode must be an int or one of " + "`active_party_column_mode` must be an int or one of " "'add_to_first', 'add_to_last', 'create_as_first', 'create_as_last', " "'add_to_all'." 
) From 382fbd60ddd1b564c26c17b075bc3a1e9b80d8f9 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 19 Dec 2024 15:44:48 +0100 Subject: [PATCH 10/14] fix docstrings example --- .../partitioner/vertical_even_partitioner.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py index d27e753e6b57..87a80428f0c1 100644 --- a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py @@ -62,10 +62,13 @@ class VerticalEvenPartitioner(Partitioner): Examples -------- + >>> from flwr_datasets import FederatedDataset + >>> from flwr_datasets.partitioner import VerticalEvenPartitioner + >>> >>> partitioner = VerticalEvenPartitioner( ... num_partitions=3, - ... active_party_column="income", - ... active_party_column_mode="add_to_last", + ... active_party_columns="income", + ... active_party_columns_mode="add_to_last", ... shuffle=True, ... seed=42 ... ) @@ -73,7 +76,7 @@ class VerticalEvenPartitioner(Partitioner): ... dataset="scikit-learn/adult-census-income", ... partitioners={"train": partitioner} ... 
) - >>> partitions = [fds.load_partition(i) for i in range(partitioner.num_partitions)] + >>> partitions = [fds.load_partition(i) for i in range(fds.partitioners["train"].num_partitions)] >>> print([partition.column_names for partition in partitions]) """ From ed3ddde53a68dc790f6d010cc8a28efe6876c2e5 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Thu, 19 Dec 2024 16:41:37 +0100 Subject: [PATCH 11/14] Move out the str or list[str] checks to inits --- .../partitioner/vertical_even_partitioner.py | 22 ++++++---------- .../partitioner/vertical_partitioner_utils.py | 26 +++++++++++++++++++ 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py index 1afdb7c04271..d1485c029562 100644 --- a/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner.py @@ -23,6 +23,7 @@ from flwr_datasets.partitioner.partitioner import Partitioner from flwr_datasets.partitioner.vertical_partitioner_utils import ( _add_active_party_columns, + _init_optional_str_or_list_str, _list_split, ) @@ -91,18 +92,20 @@ def __init__( ], int, ] = "add_to_last", - drop_columns: Optional[list[str]] = None, - shared_columns: Optional[list[str]] = None, + drop_columns: Optional[Union[str, list[str]]] = None, + shared_columns: Optional[Union[str, list[str]]] = None, shuffle: bool = True, seed: Optional[int] = 42, ) -> None: super().__init__() self._num_partitions = num_partitions - self._active_party_columns = active_party_columns or [] + self._active_party_columns = _init_optional_str_or_list_str( + active_party_columns + ) self._active_party_columns_mode = active_party_columns_mode - self._drop_columns = drop_columns or [] - self._shared_columns = shared_columns or [] + self._drop_columns = _init_optional_str_or_list_str(drop_columns) + self._shared_columns = 
_init_optional_str_or_list_str(shared_columns) self._shuffle = shuffle self._seed = seed self._rng = np.random.default_rng(seed=self._seed) @@ -177,15 +180,6 @@ def _validate_parameters_in_init(self) -> None: if self._num_partitions < 1: raise ValueError("column_distribution as int must be >= 1.") - # Validate columns lists - for parameter_name, parameter_list in [ - ("drop_columns", self._drop_columns), - ("shared_columns", self._shared_columns), - ("active_party_column", self._active_party_columns), - ]: - if not all(isinstance(column, str) for column in parameter_list): - raise ValueError(f"All entries in {parameter_name} must be strings.") - valid_modes = { "add_to_first", "add_to_last", diff --git a/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py b/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py index d232f0c80207..f9c40a1b0554 100644 --- a/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py +++ b/datasets/flwr_datasets/partitioner/vertical_partitioner_utils.py @@ -103,3 +103,29 @@ def _add_active_party_columns( # pylint: disable=R0912 for partition in partition_columns: partition.append(column) return partition_columns + + +def _init_optional_str_or_list_str(parameter: Union[str, list[str], None]) -> list[str]: + """Initialize a parameter as a list of strings. + + Parameters + ---------- + parameter : Union[str, list[str], None] + A parameter that should be a string, a list of strings, or None. + + Returns + ------- + parameter: list[str] + The parameter as a list of strings. 
+ """ + if parameter is None: + return [] + if not isinstance(parameter, (str, list)): + raise TypeError("Parameter must be a string or a list of strings") + if isinstance(parameter, list) and not all( + isinstance(single_param, str) for single_param in parameter + ): + raise TypeError("All elements in the list must be strings") + if isinstance(parameter, str): + return [parameter] + return parameter From 6b44d6adbf8a5978f51b51bbfee5bb48d4e7e624 Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:44:36 +0100 Subject: [PATCH 12/14] Update datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py Co-authored-by: Javier --- .../partitioner/vertical_even_partitioner_test.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py index dc37bde48ee0..143d5b78bd51 100644 --- a/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py @@ -53,11 +53,21 @@ def test_init_with_non_string_drop_columns(self) -> None: with self.assertRaises(ValueError): VerticalEvenPartitioner(num_partitions=2, drop_columns=[1, "a", 3]) + def test_init_with_non_list_drop_columns(self) -> None: + """Test initialization with non-list elements in drop_columns.""" + with self.assertRaises(ValueError): + VerticalEvenPartitioner(num_partitions=2, drop_columns="a") + def test_init_with_non_string_shared_columns(self) -> None: """Test initialization with non-string elements in shared_columns.""" with self.assertRaises(ValueError): VerticalEvenPartitioner(num_partitions=2, shared_columns=["col1", 123]) + def test_init_with_non_list_shared_columns(self) -> None: + """Test initialization with non-list elements in shared_columns.""" + with self.assertRaises(ValueError): + 
VerticalEvenPartitioner(num_partitions=2, shared_columns="col1") + def test_init_with_non_string_active_party_column(self) -> None: """Test initialization with non-string elements in active_party_column.""" with self.assertRaises(ValueError): @@ -65,6 +75,11 @@ def test_init_with_non_string_active_party_column(self) -> None: num_partitions=2, active_party_columns=["col1", None] ) + def test_init_with_non_list_active_party_column(self) -> None: + """Test initialization with non-list elements in active_party_column.""" + with self.assertRaises(ValueError): + VerticalEvenPartitioner(num_partitions=2, active_party_columns="col1") + def test_partitioning_basic(self) -> None: """Test basic partitioning with no special columns or dropping.""" columns = ["feature1", "feature2", "feature3", "feature4"] From 067bcb3f7f685c2bdc9cab0892b5b6e66bffc254 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 19 Dec 2024 17:11:18 +0100 Subject: [PATCH 13/14] fix --- .../flwr_datasets/partitioner/vertical_even_partitioner_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py index 143d5b78bd51..0eae437eb0f8 100644 --- a/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py @@ -53,7 +53,7 @@ def test_init_with_non_string_drop_columns(self) -> None: with self.assertRaises(ValueError): VerticalEvenPartitioner(num_partitions=2, drop_columns=[1, "a", 3]) - def test_init_with_non_list_drop_columns(self) -> None: + def test_init_with_non_list_drop_columns(self) -> None: """Test initialization with non-list elements in drop_columns.""" with self.assertRaises(ValueError): VerticalEvenPartitioner(num_partitions=2, drop_columns="a") From d30ea88eb391f4e8b9875d97bd76d5633567f9fb Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Fri, 20 Dec 2024 
09:31:46 +0100 Subject: [PATCH 14/14] Fix tests --- .../vertical_even_partitioner_test.py | 21 +++---------------- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py b/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py index 0eae437eb0f8..aa93db7a9fd5 100644 --- a/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/vertical_even_partitioner_test.py @@ -50,36 +50,21 @@ def test_init_with_invalid_active_party_mode(self) -> None: def test_init_with_non_string_drop_columns(self) -> None: """Test initialization with non-string elements in drop_columns.""" - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): VerticalEvenPartitioner(num_partitions=2, drop_columns=[1, "a", 3]) - def test_init_with_non_list_drop_columns(self) -> None: - """Test initialization with non-list elements in drop_columns.""" - with self.assertRaises(ValueError): - VerticalEvenPartitioner(num_partitions=2, drop_columns="a") - def test_init_with_non_string_shared_columns(self) -> None: """Test initialization with non-string elements in shared_columns.""" - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): VerticalEvenPartitioner(num_partitions=2, shared_columns=["col1", 123]) - def test_init_with_non_list_shared_columns(self) -> None: - """Test initialization with non-list elements in shared_columns.""" - with self.assertRaises(ValueError): - VerticalEvenPartitioner(num_partitions=2, shared_columns="col1") - def test_init_with_non_string_active_party_column(self) -> None: """Test initialization with non-string elements in active_party_column.""" - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): VerticalEvenPartitioner( num_partitions=2, active_party_columns=["col1", None] ) - def test_init_with_non_list_active_party_column(self) -> None: - """Test initialization with non-list 
elements in active_party_column.""" - with self.assertRaises(ValueError): - VerticalEvenPartitioner(num_partitions=2, active_party_columns="col1") - def test_partitioning_basic(self) -> None: """Test basic partitioning with no special columns or dropping.""" columns = ["feature1", "feature2", "feature3", "feature4"]