Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions azure-slurm-install/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -1132,6 +1132,7 @@ def main() -> None:
# create the users
setup_users(settings)

set_hostname(settings)
# create the munge key and/or copy it to /etc/munge/
munge_key(settings)

Expand All @@ -1152,8 +1153,10 @@ def main() -> None:
minute="*/5",
command=f"{settings.autoscale_dir}/return_to_idle.sh 1>&2 >> {settings.autoscale_dir}/logs/return_to_idle.log",
)

set_hostname(settings)
if settings.is_primary_scheduler == False:
# This is the HA node.
logging.info(f"Secondary Scheduler {settings.secondary_scheduler_name} starting wait on primary to finish converging.")
ilib.await_node_converge(settings.config, "scheduler", timeout=3600)

if settings.mode == "execute":
setup_slurmd(settings)
Expand Down
23 changes: 23 additions & 0 deletions azure-slurm-install/installlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,6 +615,29 @@ def await_node_hostname(
)


def await_node_converge(
    config: Dict,
    node_name: str,
    timeout=3600,
    cluster_status_func: Callable[[Dict], Dict] = cluster_status,
) -> CCNode:
    """
    Block until the node called ``node_name`` reports status ``"Ready"``.

    The node is looked up through ``get_ccnode`` on every pass; between
    unsuccessful passes the function sleeps for 10 seconds.

    Args:
        config: Passed through unchanged to ``get_ccnode``.
        node_name: Name of the node to wait on.
        timeout: Overall time budget in seconds, measured from the moment
            this function is called.
        cluster_status_func: Passed through unchanged to ``get_ccnode``.

    Returns:
        The node object from the first lookup whose ``status`` is ``"Ready"``.

    Raises:
        RuntimeError: If the time budget elapses before the node is seen
            as ``"Ready"``.
    """
    deadline = timeout + time()
    # The deadline is only evaluated at the top of each pass, so the total
    # wait can exceed ``timeout`` by up to one lookup plus one sleep.
    while time() < deadline:
        node = get_ccnode(config, node_name, cluster_status_func)
        if node.status != "Ready":
            logging.debug("Waiting for node to converge %s", node.hostname)
            sleep(10)
            continue
        return node

    raise RuntimeError(f"Node {node_name} did not converge in {timeout} seconds")

def is_valid_hostname(config: Dict, node: CCNode) -> bool:
"""
See await_node_hostname for details.
Expand Down
Loading