def await_node_converge(
    config: Dict,
    node_name: str,
    timeout: float = 600,
    cluster_status_func: Callable[[Dict], Dict] = cluster_status,
) -> CCNode:
    """
    Waits for node_name to fully converge.

    Repeatedly looks the node up via get_ccnode and returns it as soon as
    its status is "Ready".

    Args:
        config: configuration dict, passed through to get_ccnode.
        node_name: name of the node to wait on.
        timeout: maximum number of seconds to wait before giving up.
        cluster_status_func: callable used by get_ccnode to obtain the
            cluster status; overridable (e.g. for tests).

    Returns:
        The CCNode for node_name once its status is "Ready".

    Raises:
        RuntimeError: if the node is not "Ready" within timeout seconds.
    """
    poll_interval = 10
    deadline = timeout + time()
    # Check before consulting the clock so the node is always polled at
    # least once, even when timeout is zero or negative.
    while True:
        referenced_node = get_ccnode(config, node_name, cluster_status_func)
        if referenced_node.status == "Ready":
            return referenced_node

        remaining = deadline - time()
        if remaining <= 0:
            break

        logging.debug(
            "Waiting for node to converge %s",
            referenced_node.hostname,
        )
        # Never sleep past the deadline; the loop then performs one final
        # status check before the timeout error is raised.
        sleep(min(poll_interval, remaining))

    raise RuntimeError(
        f"Node {node_name} did not converge in {timeout} seconds"
    )