264 changes: 180 additions & 84 deletions azure-slurm-install/start-services.sh
@@ -1,14 +1,135 @@
#!/usr/bin/env bash
set -e

if [ "$1" == "" ]; then
echo "Usage: $0 [scheduler|execute|login]"
exit 1
fi
run_slurmdbd_via_systemctl() {

role=$1
monitoring_enabled=$(/opt/cycle/jetpack/bin/jetpack config cyclecloud.monitoring.enabled False)
OS=$(. /etc/os-release; echo $ID)
echo "Starting slurmdbd via systemctl..."
systemctl start slurmdbd

# Verify slurmdbd is responding
sleep 10
if ! sacctmgr ping > /dev/null 2>&1; then
echo "ERROR: slurmdbd started but is not responding to sacctmgr ping"
exit 2
fi
echo "slurmdbd is running and responding to ping"
}

run_slurmdbd() {

    if [[ $(jetpack config slurm.is_primary_scheduler True) == "False" ]]; then
        run_slurmdbd_via_systemctl
        return
    fi
    # Get the slurm version from scontrol
    slurm_version=$(scontrol --version | awk '{print $2}')
    if [ -z "$slurm_version" ]; then
        echo "Failed to get slurm version from scontrol --version"
        return 1
    fi
    echo "Slurm version: $slurm_version"

    # Define the expected startup message
    startup_message="slurmdbd version ${slurm_version} started"
    echo "Waiting for startup message: $startup_message"

    # Create a temp file for slurmdbd output
    log_file=$(mktemp)

    # Start slurmdbd in foreground as user slurm, redirect output to log file
    # use setsid to start slurmdbd in a new session.
    setsid sudo -u slurm /usr/sbin/slurmdbd -D > "$log_file" 2>&1 &
    slurmdbd_pid=$!

    # Monitor the log file for the startup message.
    # slurmdbd rollup can take a long time. We are considering a timeout of 1 hr.
    timeout=3600
    elapsed=0
    started=false

    while [ $elapsed -lt $timeout ]; do
        if grep -q "$startup_message" "$log_file" 2>/dev/null; then
            echo "Detected slurmdbd startup message"
            started=true
            break
        fi
        sleep 1
        elapsed=$((elapsed + 1))
    done

    # Only kill the foreground process if started successfully
    if [ "$started" == "true" ]; then
        # Kill the foreground slurmdbd process
        if [ -n "$slurmdbd_pid" ] && kill -0 $slurmdbd_pid 2>/dev/null; then
            echo "Stopping foreground slurmdbd process (PID: $slurmdbd_pid)"
            kill -INT $slurmdbd_pid
            # Wait up to 60 seconds for graceful shutdown
            wait_timeout=60
            while [ $wait_timeout -gt 0 ] && kill -0 $slurmdbd_pid 2>/dev/null; do
                sleep 1
                wait_timeout=$((wait_timeout - 1))
            done

            # Force kill if still running
            if kill -0 $slurmdbd_pid 2>/dev/null; then
                echo "Process did not exit gracefully, sending SIGKILL"
                kill -9 $slurmdbd_pid 2>/dev/null
                sleep 1
            fi
            echo "Foreground slurmdbd process stopped"
        fi

        run_slurmdbd_via_systemctl
    else
        echo "slurmdbd startup is taking long, manual intervention is required"
    fi

    # clean up the log file
    rm -f "$log_file"
}

run_slurmctld() {
    echo "Starting Slurmctld"
    systemctl start slurmctld
    attempts=3
    delay=5
    set +e
    for i in $( seq 1 $attempts ); do
        echo $i/$attempts sleeping $delay seconds before running scontrol ping
        sleep $delay
        scontrol ping > /dev/null 2>&1;
        if [ $? == 0 ]; then
            systemctl start slurmctld || exit 1
            break
        fi;
    done
    if [ $i == $attempts ] && [ $? != 0 ]; then
        echo FATAL: slurmctld failed to start! 1>&2
        echo Here are the last 100 lines of slurmctld.log
        tail -n 100 /var/log/slurmctld/slurmctld.log 1>&2
        exit 2
    fi
}

run_slurmrestd() {
    if [[ "$OS" == "sle_hpc" ]]; then
        echo Warning: slurmrestd is not supported on SUSE, skipping start. 1>&2
        exit 0
    fi
    monitoring_enabled=$(/opt/cycle/jetpack/bin/jetpack config cyclecloud.monitoring.enabled False)
    systemctl start slurmrestd
    systemctl status slurmrestd --no-pager > /dev/null
    if [ $? != 0 ]; then
        echo Warning: slurmrestd failed to start! 1>&2
        /opt/cycle/jetpack/bin/jetpack log "slurmrestd failed to start" --level=warn --priority=medium
        exit 0
    fi
    # start slurm_exporter if monitoring is enabled and slurmrestd is running
    if [[ "$monitoring_enabled" == "True" ]]; then
        run_slurm_exporter
    fi
}

reload_prom_config(){
    # Find the Prometheus process and send SIGHUP to reload config or log a warning if not found
@@ -24,64 +145,6 @@ reload_prom_config(){
        echo "Prometheus process not found, unable to reload configuration"
    fi
}
# all nodes need to have munge running
echo restarting munge...
systemctl restart munge
# wait up to 60 seconds for munge to start
iters=60
while [ $iters -ge 0 ]; do
    echo test | munge > /dev/null 2>&1
    if [ $? == 0 ]; then
        break
    fi
    sleep 1
    iters=$(( $iters - 1 ))
done

# login nodes explicitly should _not_ have slurmd running.
if [ $role == "login" ]; then
reload_prom_config
exit 0
fi

# execute nodes just need slurmd
if [ $role == "execute" ]; then
systemctl start slurmd
reload_prom_config
exit 0
fi

# sanity check - make sure a valid role was actually passed in.
# note they are defined in the slurm_*_role.rb
if [ $role != "scheduler" ]; then
echo unknown role! $role 1>&2
exit 2
fi

# lastly - the scheduler

systemctl show slurmdbd 2>&1 > /dev/null && systemctl start slurmdbd
# there is no obvious way to check slurmdbd status _before_ starting slurmctld
sleep 10
systemctl start slurmctld
attempts=3
delay=5
set +e
for i in $( seq 1 $attempts ); do
    echo $i/$attempts sleeping $delay seconds before running scontrol ping
    sleep $delay
    scontrol ping
    if [ $? == 0 ]; then
        systemctl start slurmctld || exit 1
        break
    fi;
done
if [ $i == $attempts ] && [ $? != 0 ]; then
    echo FATAL: slurmctld failed to start! 1>&2
    echo Here are the last 100 lines of slurmctld.log
    tail -n 100 /var/log/slurmctld/slurmctld.log 1>&2
    exit 2
fi

run_slurm_exporter() {
    # Run Slurm Exporter in a container
@@ -145,22 +208,55 @@ run_slurm_exporter() {
    fi
}

if [[ "$OS" == "sle_hpc" ]]; then
echo Warning: slurmrestd is not supported on SUSE, skipping start. 1>&2
exit 0
fi

# start slurmrestd
sleep 10
systemctl start slurmrestd
systemctl status slurmrestd --no-pager > /dev/null
if [ $? != 0 ]; then
echo Warning: slurmrestd failed to start! 1>&2
/opt/cycle/jetpack/bin/jetpack log "slurmrestd failed to start" --level=warn --priority=medium
exit 0
fi
# start slurm_exporter if monitoring is enabled and slurmrestd is running
if [[ "$monitoring_enabled" == "True" ]]; then
run_slurm_exporter
fi
exit 0

{
    if [ "$1" == "" ]; then
        echo "Usage: $0 [scheduler|execute|login]"
        exit 1
    fi

    role=$1

    OS=$(. /etc/os-release; echo $ID)
    echo "Starting services"
    # all nodes need to have munge running
    echo restarting munge...
    systemctl restart munge
    # wait up to 60 seconds for munge to start
    iters=60
    while [ $iters -ge 0 ]; do
        echo test | munge > /dev/null 2>&1
        if [ $? == 0 ]; then
            break
        fi
        sleep 1
        iters=$(( $iters - 1 ))
    done

    # login nodes explicitly should _not_ have slurmd running.
    if [ $role == "login" ]; then
        reload_prom_config
        exit 0
    fi

    # execute nodes just need slurmd
    if [ $role == "execute" ]; then
        systemctl start slurmd
        reload_prom_config
        exit 0
    fi

    # sanity check - make sure a valid role was actually passed in.
    # note they are defined in the slurm_*_role.rb
    if [ $role != "scheduler" ]; then
        echo unknown role! $role 1>&2
        exit 2
    fi

    # lastly - the scheduler
    run_slurmdbd

    run_slurmctld

    run_slurmrestd
} 2>&1 | tee -a /var/log/azure-slurm-install.log