Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions agent/conf/agent.properties
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,22 @@ iscsi.session.cleanup.enabled=false
# This parameter specifies if the host must be rebooted when something goes wrong with the heartbeat.
#reboot.host.and.alert.management.on.heartbeat.timeout=true

# Action taken by kvmheartbeat.sh / kvmspheartbeat.sh when a storage heartbeat
# write fails persistently. Supersedes the legacy boolean option
# 'reboot.host.and.alert.management.on.heartbeat.timeout' when set to a non-default value.
#
Comment on lines +314 to +316
# Allowed values:
# reboot - immediate sysrq-trigger reboot (default; original behavior)
# graceful-reboot - 'systemctl reboot' instead of sysrq; allows VMs to stop cleanly
# restart-agent - restart cloudstack-agent only; running VMs are preserved
# log-only - log + alert; take no automatic action (admin must investigate)
# Any other (unrecognized) value is treated as 'reboot'.
#
# The 'graceful-reboot', 'restart-agent', and 'log-only' actions are recommended
# for setups using LINSTOR/DRBD or any local storage with replication, where
# transient I/O contention can cause a heartbeat write to time out without the
# host actually being unhealthy.
#kvm.heartbeat.fence.action=reboot

# Enables manually setting CPU's topology on KVM's VM.
#enable.manually.setting.cpu.topology.on.kvm.vm=true

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,25 @@ public class AgentProperties{
public static final Property<Boolean> REBOOT_HOST_AND_ALERT_MANAGEMENT_ON_HEARTBEAT_TIMEOUT
= new Property<>("reboot.host.and.alert.management.on.heartbeat.timeout", true);

/**
 * Fence action performed by the KVM agent's storage heartbeat scripts
 * ({@code kvmheartbeat.sh} / {@code kvmspheartbeat.sh}) when a heartbeat write fails
 * persistently. Allowed values:
 * <ul>
 * <li>{@code reboot} (default) — immediate reboot through {@code /proc/sysrq-trigger}; original behavior</li>
 * <li>{@code graceful-reboot} — {@code systemctl reboot} instead of sysrq, lets VMs stop cleanly</li>
 * <li>{@code restart-agent} — {@code systemctl restart cloudstack-agent} only; running VMs preserved</li>
 * <li>{@code log-only} — write a syslog entry (tag {@code heartbeat}) and take no automatic action;
 * the operator must investigate</li>
 * </ul>
 * The non-default values are recommended for setups using LINSTOR/DRBD or other replicated
 * local storage, where transient I/O contention can cause a heartbeat write to time out
 * without the host actually being unhealthy.<br>
 * Read by the heartbeat shell scripts directly from {@code /etc/cloudstack/agent/agent.properties}:
 * the last uncommented assignment wins, whitespace is stripped from the value, and an empty or
 * unrecognized value is handled as {@code reboot}.<br>
 * Data type: String.<br>
 * Default value: {@code reboot}
 */
public static final Property<String> KVM_HEARTBEAT_FENCE_ACTION
= new Property<>("kvm.heartbeat.fence.action", "reboot");

/**
* Enables manually setting CPU's topology on KVM's VM. <br>
* Data type: Boolean.<br>
Expand Down
42 changes: 37 additions & 5 deletions scripts/vm/hypervisor/kvm/kvmheartbeat.sh
Original file line number Diff line number Diff line change
Expand Up @@ -156,11 +156,43 @@ then
exit 0
elif [ "$cflag" == "1" ]
then
/usr/bin/logger -t heartbeat "kvmheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
sync &
sleep 5
echo b > /proc/sysrq-trigger
exit $?
# The heartbeat could not be written to storage: pick the fence action.
# It is read from 'kvm.heartbeat.fence.action' in agent.properties; an unreadable
# file, a missing key or an empty value keeps the backward-compatible default 'reboot'.
# Allowed values: reboot | graceful-reboot | restart-agent | log-only
AGENT_PROPS="/etc/cloudstack/agent/agent.properties"
FENCE_ACTION="reboot"
if [ -r "$AGENT_PROPS" ]; then
    # Commented-out lines do not match; the last assignment wins and all
    # whitespace (including a trailing CR) is stripped from the value.
    val=$(grep -E '^[[:space:]]*kvm\.heartbeat\.fence\.action[[:space:]]*=' "$AGENT_PROPS" | tail -n 1 | cut -d= -f2- | tr -d '[:space:]')
    [ -n "$val" ] && FENCE_ACTION="$val"
fi

case "$FENCE_ACTION" in
    log-only)
        # Report the failure and leave host, agent and VMs untouched.
        /usr/bin/logger -t heartbeat "kvmheartbeat.sh: heartbeat write to storage failed; fence action 'log-only' selected — taking no automatic action. Operator must investigate."
        exit 0
        ;;
    restart-agent)
        /usr/bin/logger -t heartbeat "kvmheartbeat.sh: heartbeat write to storage failed; fence action 'restart-agent' — restarting cloudstack-agent (running VMs preserved)."
        # sync runs in the background so a stalled flush cannot block the restart.
        sync &
        sleep 2
        systemctl restart cloudstack-agent
        exit $?
        ;;
    graceful-reboot)
        /usr/bin/logger -t heartbeat "kvmheartbeat.sh: heartbeat write to storage failed; fence action 'graceful-reboot' — rebooting via systemctl (allows running VMs to stop cleanly)."
        sync &
        sleep 5
        systemctl reboot
        exit $?
        ;;
    reboot|*)
        # A typo or unsupported value must not silently turn into a hard reboot:
        # record the offending value before falling back to the default action.
        if [ "$FENCE_ACTION" != "reboot" ]; then
            /usr/bin/logger -t heartbeat "kvmheartbeat.sh: unrecognized kvm.heartbeat.fence.action '$FENCE_ACTION' in $AGENT_PROPS; falling back to 'reboot'."
        fi
        # Original behavior: immediate kernel-level reboot via sysrq-trigger
        /usr/bin/logger -t heartbeat "kvmheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
        sync &
        sleep 5
        echo b > /proc/sysrq-trigger
        exit $?
        ;;
esac
else
write_hbLog
exit $?
Expand Down
42 changes: 37 additions & 5 deletions scripts/vm/hypervisor/kvm/kvmspheartbeat.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,41 @@ deleteVMs() {

if [ "$cflag" == "1" ]
then
/usr/bin/logger -t heartbeat "kvmspheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
sync &
sleep 5
echo b > /proc/sysrq-trigger
exit $?
# The heartbeat could not be written to storage: pick the fence action.
# It is read from 'kvm.heartbeat.fence.action' in agent.properties; an unreadable
# file, a missing key or an empty value keeps the backward-compatible default 'reboot'.
# Allowed values: reboot | graceful-reboot | restart-agent | log-only
AGENT_PROPS="/etc/cloudstack/agent/agent.properties"
FENCE_ACTION="reboot"
if [ -r "$AGENT_PROPS" ]; then
    # Commented-out lines do not match; the last assignment wins and all
    # whitespace (including a trailing CR) is stripped from the value.
    val=$(grep -E '^[[:space:]]*kvm\.heartbeat\.fence\.action[[:space:]]*=' "$AGENT_PROPS" | tail -n 1 | cut -d= -f2- | tr -d '[:space:]')
    [ -n "$val" ] && FENCE_ACTION="$val"
fi

case "$FENCE_ACTION" in
    log-only)
        # Report the failure and leave host, agent and VMs untouched.
        /usr/bin/logger -t heartbeat "kvmspheartbeat.sh: heartbeat write to storage failed; fence action 'log-only' selected — taking no automatic action. Operator must investigate."
        exit 0
        ;;
    restart-agent)
        /usr/bin/logger -t heartbeat "kvmspheartbeat.sh: heartbeat write to storage failed; fence action 'restart-agent' — restarting cloudstack-agent (running VMs preserved)."
        # sync runs in the background so a stalled flush cannot block the restart.
        sync &
        sleep 2
        systemctl restart cloudstack-agent
        exit $?
        ;;
    graceful-reboot)
        /usr/bin/logger -t heartbeat "kvmspheartbeat.sh: heartbeat write to storage failed; fence action 'graceful-reboot' — rebooting via systemctl (allows running VMs to stop cleanly)."
        sync &
        sleep 5
        systemctl reboot
        exit $?
        ;;
    reboot|*)
        # A typo or unsupported value must not silently turn into a hard reboot:
        # record the offending value before falling back to the default action.
        if [ "$FENCE_ACTION" != "reboot" ]; then
            /usr/bin/logger -t heartbeat "kvmspheartbeat.sh: unrecognized kvm.heartbeat.fence.action '$FENCE_ACTION' in $AGENT_PROPS; falling back to 'reboot'."
        fi
        # Original behavior: immediate kernel-level reboot via sysrq-trigger
        /usr/bin/logger -t heartbeat "kvmspheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
        sync &
        sleep 5
        echo b > /proc/sysrq-trigger
        exit $?
        ;;
esac
fi
Loading