Fixups and testing for cli config file parsing (#722)

bpkroth · web-flow · commit 7dce3d10faac · 2024-07-23T14:58:19.000-07:00
Further fixups to #717. Some parameters were not being respected from `--config test-cli-config.jsonc` files. Split out from #720.
diff --git a/mlos_bench/mlos_bench/config/schedulers/sync_scheduler.jsonc b/mlos_bench/mlos_bench/config/schedulers/sync_scheduler.jsonc
@@ -6,7 +6,7 @@
 
     "config": {
         "trial_config_repeat_count": 3,
-        "max_trials": -1,  // Limited only in hte Optimizer logic/config.
+        "max_trials": -1,  // Limited only in the Optimizer logic/config.
         "teardown": false
     }
 }
diff --git a/mlos_bench/mlos_bench/launcher.py b/mlos_bench/mlos_bench/launcher.py
@@ -44,6 +44,7 @@ class Launcher:
 
     def __init__(self, description: str, long_text: str = "", argv: Optional[List[str]] = None):
         # pylint: disable=too-many-statements
+        # pylint: disable=too-many-locals
         _LOG.info("Launch: %s", description)
         epilog = """
             Additional --key=value pairs can be specified to augment or override
@@ -56,7 +57,7 @@ def __init__(self, description: str, long_text: str = "", argv: Optional[List[st
             <https://github.com/microsoft/MLOS/tree/main/mlos_bench/>
             """
         parser = argparse.ArgumentParser(description=f"{description} : {long_text}", epilog=epilog)
-        (args, args_rest) = self._parse_args(parser, argv)
+        (args, path_args, args_rest) = self._parse_args(parser, argv)
 
         # Bootstrap config loader: command line takes priority.
         config_path = args.config_path or []
@@ -87,11 +88,25 @@ def __init__(self, description: str, long_text: str = "", argv: Optional[List[st
 
         self._parent_service: Service = LocalExecService(parent=self._config_loader)
 
+        # Prepare global_config from a combination of global config files, cli
+        # configs, and cli args.
+        args_dict = vars(args)
+        # teardown (bool) conflicts with Environment configs that use it for shell
+        # commands (list), so we exclude it from copying over
+        excluded_cli_args = path_args + ["teardown"]
+        # Include (almost) any item from the cli config file that either isn't in
+        # the cli args at all or whose cli arg is missing.
+        cli_config_args = {
+            key: val
+            for (key, val) in config.items()
+            if (args_dict.get(key) is None) and key not in excluded_cli_args
+        }
+
         self.global_config = self._load_config(
-            config.get("globals", []) + (args.globals or []),
-            (args.config_path or []) + config.get("config_path", []),
-            args_rest,
-            {key: val for (key, val) in config.items() if key not in vars(args)},
+            args_globals=config.get("globals", []) + (args.globals or []),
+            config_path=(args.config_path or []) + config.get("config_path", []),
+            args_rest=args_rest,
+            global_config=cli_config_args,
         )
         # experiment_id is generally taken from --globals files, but we also allow
         # overriding it on the CLI.
@@ -168,19 +183,35 @@ def service(self) -> Service:
     def _parse_args(
         parser: argparse.ArgumentParser,
         argv: Optional[List[str]],
-    ) -> Tuple[argparse.Namespace, List[str]]:
+    ) -> Tuple[argparse.Namespace, List[str], List[str]]:
         """Parse the command line arguments."""
-        parser.add_argument(
+
+        class PathArgsTracker:
+            """Simple class to help track which arguments are paths."""
+
+            def __init__(self, parser: argparse.ArgumentParser):
+                self._parser = parser
+                self.path_args: List[str] = []
+
+            def add_argument(self, *args: Any, **kwargs: Any) -> None:
+                """Add an argument to the parser and track its destination."""
+                self.path_args.append(self._parser.add_argument(*args, **kwargs).dest)
+
+        path_args_tracker = PathArgsTracker(parser)
+
+        path_args_tracker.add_argument(
             "--config",
             required=False,
-            help="Main JSON5 configuration file. Its keys are the same as the"
-            + " command line options and can be overridden by the latter.\n"
-            + "\n"
-            + " See the `mlos_bench/config/` tree at https://github.com/microsoft/MLOS/ "
-            + " for additional config examples for this and other arguments.",
+            help=(
+                "Main JSON5 configuration file. Its keys are the same as the "
+                "command line options and can be overridden by the latter.\n"
+                "\n"
+                "See the `mlos_bench/config/` tree at https://github.com/microsoft/MLOS/ "
+                "for additional config examples for this and other arguments."
+            ),
         )
 
-        parser.add_argument(
+        path_args_tracker.add_argument(
             "--log_file",
             "--log-file",
             required=False,
@@ -192,11 +223,13 @@ def _parse_args(
             "--log-level",
             required=False,
             type=str,
-            help=f"Logging level. Default is {logging.getLevelName(_LOG_LEVEL)}."
-            + " Set to DEBUG for debug, WARNING for warnings only.",
+            help=(
+                f"Logging level. Default is {logging.getLevelName(_LOG_LEVEL)}. "
+                "Set to DEBUG for debug, WARNING for warnings only."
+            ),
         )
 
-        parser.add_argument(
+        path_args_tracker.add_argument(
             "--config_path",
             "--config-path",
             "--config-paths",
@@ -207,7 +240,7 @@ def _parse_args(
             help="One or more locations of JSON config files.",
         )
 
-        parser.add_argument(
+        path_args_tracker.add_argument(
             "--service",
             "--services",
             nargs="+",
@@ -219,17 +252,19 @@ def _parse_args(
             ),
         )
 
-        parser.add_argument(
+        path_args_tracker.add_argument(
             "--environment",
             required=False,
             help="Path to JSON file with the configuration of the benchmarking environment(s).",
         )
 
-        parser.add_argument(
+        path_args_tracker.add_argument(
             "--optimizer",
             required=False,
-            help="Path to the optimizer configuration file. If omitted, run"
-            + " a single trial with default (or specified in --tunable_values).",
+            help=(
+                "Path to the optimizer configuration file. If omitted, run "
+                "a single trial with default (or specified in --tunable_values)."
+            ),
         )
 
         parser.add_argument(
@@ -243,18 +278,22 @@ def _parse_args(
             ),
         )
 
-        parser.add_argument(
+        path_args_tracker.add_argument(
             "--scheduler",
             required=False,
-            help="Path to the scheduler configuration file. By default, use"
-            + " a single worker synchronous scheduler.",
+            help=(
+                "Path to the scheduler configuration file. By default, use "
+                "a single worker synchronous scheduler."
+            ),
         )
 
-        parser.add_argument(
+        path_args_tracker.add_argument(
             "--storage",
             required=False,
-            help="Path to the storage configuration file."
-            + " If omitted, use the ephemeral in-memory SQL storage.",
+            help=(
+                "Path to the storage configuration file. "
+                "If omitted, use the ephemeral in-memory SQL storage."
+            ),
         )
 
         parser.add_argument(
@@ -275,24 +314,28 @@ def _parse_args(
             help="Seed to use with --random_init",
         )
 
-        parser.add_argument(
+        path_args_tracker.add_argument(
             "--tunable_values",
             "--tunable-values",
             nargs="+",
             action="extend",
             required=False,
-            help="Path to one or more JSON files that contain values of the tunable"
-            + " parameters. This can be used for a single trial (when no --optimizer"
-            + " is specified) or as default values for the first run in optimization.",
+            help=(
+                "Path to one or more JSON files that contain values of the tunable "
+                "parameters. This can be used for a single trial (when no --optimizer "
+                "is specified) or as default values for the first run in optimization."
+            ),
         )
 
-        parser.add_argument(
+        path_args_tracker.add_argument(
             "--globals",
             nargs="+",
             action="extend",
             required=False,
-            help="Path to one or more JSON files that contain additional"
-            + " [private] parameters of the benchmarking environment.",
+            help=(
+                "Path to one or more JSON files that contain additional "
+                "[private] parameters of the benchmarking environment."
+            ),
         )
 
         parser.add_argument(
@@ -328,7 +371,7 @@ def _parse_args(
             argv = sys.argv[1:].copy()
         (args, args_rest) = parser.parse_known_args(argv)
 
-        return (args, args_rest)
+        return (args, path_args_tracker.path_args, args_rest)
 
     @staticmethod
     def _try_parse_extra_args(cmdline: Iterable[str]) -> Dict[str, TunableValue]:
@@ -361,6 +404,7 @@ def _try_parse_extra_args(cmdline: Iterable[str]) -> Dict[str, TunableValue]:
 
     def _load_config(
         self,
+        *,
         args_globals: Iterable[str],
         config_path: Iterable[str],
         args_rest: Iterable[str],
diff --git a/mlos_bench/mlos_bench/optimizers/base_optimizer.py b/mlos_bench/mlos_bench/optimizers/base_optimizer.py
@@ -135,20 +135,23 @@ def __exit__(
     @property
     def current_iteration(self) -> int:
         """
-        The current number of iterations (trials) registered.
+        The current number of iterations (suggestions) registered.
 
         Note: this may or may not be the same as the number of configurations.
-        See Also: Launcher.trial_config_repeat_count.
+        See Also: Scheduler.trial_config_repeat_count and Scheduler.max_trials.
         """
         return self._iter
 
+    # TODO: finish renaming iterations to suggestions.
+    # See Also: https://github.com/microsoft/MLOS/pull/713
+
     @property
     def max_iterations(self) -> int:
         """
-        The maximum number of iterations (trials) to run.
+        The maximum number of iterations (suggestions) to run.
 
         Note: this may or may not be the same as the number of configurations.
-        See Also: Launcher.trial_config_repeat_count.
+        See Also: Scheduler.trial_config_repeat_count and Scheduler.max_trials.
         """
         return self._max_iter
 
diff --git a/mlos_bench/mlos_bench/schedulers/base_scheduler.py b/mlos_bench/mlos_bench/schedulers/base_scheduler.py
@@ -14,6 +14,7 @@
 from pytz import UTC
 from typing_extensions import Literal
 
+from mlos_bench.config.schemas import ConfigSchema
 from mlos_bench.environments.base_environment import Environment
 from mlos_bench.optimizers.base_optimizer import Optimizer
 from mlos_bench.storage.base_storage import Storage
@@ -64,6 +65,7 @@ def __init__(  # pylint: disable=too-many-arguments
             source=global_config,
             required_keys=["experiment_id", "trial_id"],
         )
+        self._validate_json_config(config)
 
         self._experiment_id = config["experiment_id"].strip()
         self._trial_id = int(config["trial_id"])
@@ -88,6 +90,36 @@ def __init__(  # pylint: disable=too-many-arguments
 
         _LOG.debug("Scheduler instantiated: %s :: %s", self, config)
 
+    def _validate_json_config(self, config: dict) -> None:
+        """Reconstructs a basic json config that this class might have been instantiated
+        from in order to validate configs provided outside the file loading
+        mechanism.
+        """
+        json_config: dict = {
+            "class": self.__class__.__module__ + "." + self.__class__.__name__,
+        }
+        if config:
+            json_config["config"] = config.copy()
+            # The json schema does not allow for -1 as a valid value for config_id.
+            # As it is just a default placeholder value, and not required, we can
+            # remove it from the config copy prior to validation safely.
+            config_id = json_config["config"].get("config_id")
+            if config_id is not None and isinstance(config_id, int) and config_id < 0:
+                json_config["config"].pop("config_id")
+        ConfigSchema.SCHEDULER.validate(json_config)
+
+    @property
+    def trial_config_repeat_count(self) -> int:
+        """Gets the number of trials to run for a given config."""
+        return self._trial_config_repeat_count
+
+    @property
+    def max_trials(self) -> int:
+        """Gets the maximum number of trials to run for a given experiment, or -1 for no
+        limit.
+        """
+        return self._max_trials
+
     def __repr__(self) -> str:
         """
         Produce a human-readable version of the Scheduler (mostly for logging).
diff --git a/mlos_bench/mlos_bench/tests/config/cli/test-cli-config.jsonc b/mlos_bench/mlos_bench/tests/config/cli/test-cli-config.jsonc
@@ -17,7 +17,7 @@
         "services/remote/mock/mock_fileshare_service.jsonc"
     ],
 
-    "trial_config_repeat_count": 1,
+    "trial_config_repeat_count": 2,
 
     "random_seed": 42,
     "random_init": true
diff --git a/mlos_bench/mlos_bench/tests/launcher_parse_args_test.py b/mlos_bench/mlos_bench/tests/launcher_parse_args_test.py

Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,7 @@`
`6`	`6`
`7`	`7`	`"config": {`
`8`	`8`	`"trial_config_repeat_count": 3,`
`9`		`- "max_trials": -1, // Limited only in hte Optimizer logic/config.`
	`9`	`+ "max_trials": -1, // Limited only in the Optimizer logic/config.`
`10`	`10`	`"teardown": false`
`11`	`11`	`}`
`12`	`12`	`}`