Skip to content

Commit a454e85

Browse files
Host HA code improvements
1 parent ffebe8e commit a454e85

28 files changed

Lines changed: 300 additions & 317 deletions

File tree

api/src/main/java/com/cloud/ha/Investigator.java

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,11 @@ public interface Investigator extends Adapter {
2727
*
2828
* @param vm to work on.
2929
*/
30-
public boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM;
30+
boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM;
3131

32-
public Status isAgentAlive(Host agent);
32+
Status getHostAgentStatus(Host host);
3333

3434
class UnknownVM extends Exception {
35-
36-
/**
37-
*
38-
*/
3935
private static final long serialVersionUID = 1L;
40-
4136
};
4237
}

core/src/main/java/com/cloud/agent/api/CheckOnHostAnswer.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ public CheckOnHostAnswer(CheckOnHostCommand cmd, Boolean alive, String details)
3838

3939
public CheckOnHostAnswer(CheckOnHostCommand cmd, String details) {
4040
super(cmd, false, details);
41+
determined = false;
42+
alive = false;
4143
}
4244

4345
public boolean isDetermined() {
@@ -47,5 +49,4 @@ public boolean isDetermined() {
4749
public boolean isAlive() {
4850
return alive;
4951
}
50-
5152
}

core/src/main/java/com/cloud/agent/api/CheckOnHostCommand.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
public class CheckOnHostCommand extends Command {
2626
HostTO host;
27-
boolean reportCheckFailureIfOneStorageIsDown;
27+
boolean reportIfHeartBeatFailedForOneStoragePool;
2828

2929
protected CheckOnHostCommand() {
3030
}
@@ -34,17 +34,17 @@ public CheckOnHostCommand(Host host) {
3434
setWait(20);
3535
}
3636

37-
public CheckOnHostCommand(Host host, boolean reportCheckFailureIfOneStorageIsDown) {
37+
public CheckOnHostCommand(Host host, boolean reportIfHeartBeatFailedForOneStoragePool) {
3838
this(host);
39-
this.reportCheckFailureIfOneStorageIsDown = reportCheckFailureIfOneStorageIsDown;
39+
this.reportIfHeartBeatFailedForOneStoragePool = reportIfHeartBeatFailedForOneStoragePool;
4040
}
4141

4242
public HostTO getHost() {
4343
return host;
4444
}
4545

46-
public boolean isCheckFailedOnOneStorage() {
47-
return reportCheckFailureIfOneStorageIsDown;
46+
public boolean shouldReportIfHeartBeatFailedForOneStoragePool() {
47+
return reportIfHeartBeatFailedForOneStoragePool;
4848
}
4949

5050
@Override

engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,10 @@ public interface HighAvailabilityManager extends Manager {
7575
+ " which are registered for the HA event that were successful and are now ready to be purged.",
7676
true, Cluster);
7777

78-
public static final ConfigKey<Boolean> KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>("Advanced", Boolean.class, "kvm.ha.fence.on.storage.heartbeat.failure", "false",
78+
ConfigKey<Boolean> KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>("Advanced", Boolean.class, "kvm.ha.fence.on.storage.heartbeat.failure", "false",
7979
"Proceed fencing the host even the heartbeat failed for only one storage pool", false, ConfigKey.Scope.Zone);
8080

81-
public enum WorkType {
81+
enum WorkType {
8282
Migration, // Migrating VMs off of a host.
8383
Stop, // Stops a VM for storage pool migration purposes. This should be obsolete now.
8484
CheckStop, // Checks if a VM has been stopped.

plugins/hypervisors/hyperv/src/main/java/com/cloud/ha/HypervInvestigator.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,15 +41,15 @@ public class HypervInvestigator extends AdapterBase implements Investigator {
4141

4242
@Override
4343
public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws UnknownVM {
44-
Status status = isAgentAlive(host);
44+
Status status = getHostAgentStatus(host);
4545
if (status == null) {
4646
throw new UnknownVM();
4747
}
48-
return status == Status.Up ? true : null;
48+
return status == Status.Up;
4949
}
5050

5151
@Override
52-
public Status isAgentAlive(Host agent) {
52+
public Status getHostAgentStatus(Host agent) {
5353
if (agent.getHypervisorType() != Hypervisor.HypervisorType.Hyperv) {
5454
return null;
5555
}

plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java

Lines changed: 21 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,7 @@
1919
package com.cloud.ha;
2020

2121
import com.cloud.agent.AgentManager;
22-
import com.cloud.agent.api.Answer;
23-
import com.cloud.agent.api.CheckOnHostCommand;
2422
import com.cloud.host.Host;
25-
import com.cloud.host.HostVO;
2623
import com.cloud.host.Status;
2724
import com.cloud.host.dao.HostDao;
2825
import com.cloud.hypervisor.Hypervisor;
@@ -34,11 +31,12 @@
3431
import org.apache.cloudstack.engine.subsystem.api.storage.DataStoreProviderManager;
3532
import org.apache.cloudstack.engine.subsystem.api.storage.PrimaryDataStoreDriver;
3633
import org.apache.cloudstack.ha.HAManager;
34+
import org.apache.cloudstack.kvm.ha.KVMHostActivityChecker;
3735
import org.apache.cloudstack.storage.datastore.db.PrimaryDataStoreDao;
3836
import org.apache.cloudstack.storage.datastore.db.StoragePoolVO;
3937

4038
import javax.inject.Inject;
41-
import java.util.Arrays;
39+
import java.util.Collections;
4240
import java.util.List;
4341

4442
public class KVMInvestigator extends AdapterBase implements Investigator {
@@ -54,13 +52,15 @@ public class KVMInvestigator extends AdapterBase implements Investigator {
5452
private HAManager haManager;
5553
@Inject
5654
private DataStoreProviderManager dataStoreProviderMgr;
55+
@Inject
56+
private KVMHostActivityChecker hostActivityChecker;
5757

5858
@Override
5959
public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws UnknownVM {
6060
if (haManager.isHAEligible(host)) {
6161
return haManager.isVMAliveOnHost(host);
6262
}
63-
Status status = isAgentAlive(host);
63+
Status status = getHostAgentStatus(host);
6464
logger.debug("HA: HOST is ineligible legacy state {} for host {}", status, host);
6565
if (status == null) {
6666
throw new UnknownVM();
@@ -73,86 +73,41 @@ public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws Unkno
7373
}
7474

7575
@Override
76-
public Status isAgentAlive(Host agent) {
77-
if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM && agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
76+
public Status getHostAgentStatus(Host host) {
77+
if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
7878
return null;
7979
}
8080

81-
if (haManager.isHAEligible(agent)) {
82-
return haManager.getHostStatus(agent);
81+
if (haManager.isHAEligible(host)) {
82+
return haManager.getHostStatus(host);
8383
}
8484

85-
List<StoragePoolVO> clusterPools = _storagePoolDao.findPoolsInClusters(Arrays.asList(agent.getClusterId()), null);
86-
boolean storageSupportHA = storageSupportHa(clusterPools);
87-
if (!storageSupportHA) {
88-
List<StoragePoolVO> zonePools = _storagePoolDao.findZoneWideStoragePoolsByHypervisor(agent.getDataCenterId(), agent.getHypervisorType());
89-
storageSupportHA = storageSupportHa(zonePools);
85+
List<StoragePoolVO> clusterPools = _storagePoolDao.findPoolsInClusters(Collections.singletonList(host.getClusterId()), null);
86+
boolean storageSupportsHA = storageSupportsHA(clusterPools);
87+
if (!storageSupportsHA) {
88+
List<StoragePoolVO> zonePools = _storagePoolDao.findZoneWideStoragePoolsByHypervisor(host.getDataCenterId(), host.getHypervisorType());
89+
storageSupportsHA = storageSupportsHA(zonePools);
9090
}
91-
if (!storageSupportHA) {
92-
logger.warn("Agent investigation was requested on host {}, but host does not support investigation because it has no NFS storage. Skipping investigation.", agent);
91+
if (!storageSupportsHA) {
92+
logger.warn("Agent investigation was requested on host {}, but host does not support investigation" +
93+
" because it has no HA supported storage. Skipping investigation.", host);
9394
return null;
9495
}
9596

96-
Status hostStatus = null;
97-
Status neighbourStatus = null;
98-
boolean reportFailureIfOneStorageIsDown = HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value();
99-
CheckOnHostCommand cmd = new CheckOnHostCommand(agent, reportFailureIfOneStorageIsDown);
100-
101-
try {
102-
Answer answer = _agentMgr.easySend(agent.getId(), cmd);
103-
if (answer != null) {
104-
hostStatus = answer.getResult() ? Status.Down : Status.Up;
105-
}
106-
} catch (Exception e) {
107-
logger.debug("Failed to send command to host: {}", agent);
108-
}
109-
if (hostStatus == null) {
110-
hostStatus = Status.Disconnected;
111-
}
112-
113-
List<HostVO> neighbors = _resourceMgr.listHostsInClusterByStatus(agent.getClusterId(), Status.Up);
114-
for (HostVO neighbor : neighbors) {
115-
if (neighbor.getId() == agent.getId()
116-
|| (neighbor.getHypervisorType() != Hypervisor.HypervisorType.KVM && neighbor.getHypervisorType() != Hypervisor.HypervisorType.LXC)) {
117-
continue;
118-
}
119-
logger.debug("Investigating host:{} via neighbouring host:{}", agent, neighbor);
120-
try {
121-
Answer answer = _agentMgr.easySend(neighbor.getId(), cmd);
122-
if (answer != null) {
123-
neighbourStatus = answer.getResult() ? Status.Down : Status.Up;
124-
logger.debug("Neighbouring host:{} returned status:{} for the investigated host:{}", neighbor, neighbourStatus, agent);
125-
if (neighbourStatus == Status.Up) {
126-
break;
127-
}
128-
}
129-
} catch (Exception e) {
130-
logger.debug("Failed to send command to host: {}", neighbor);
131-
}
132-
}
133-
if (neighbourStatus == Status.Up && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) {
134-
hostStatus = Status.Disconnected;
135-
}
136-
if (neighbourStatus == Status.Down && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) {
137-
hostStatus = Status.Down;
138-
}
139-
logger.debug("HA: HOST is ineligible legacy state {} for host {}", hostStatus, agent);
140-
return hostStatus;
97+
return hostActivityChecker.getHostAgentStatus(host);
14198
}
14299

143-
private boolean storageSupportHa(List<StoragePoolVO> pools) {
144-
boolean storageSupportHA = false;
100+
private boolean storageSupportsHA(List<StoragePoolVO> pools) {
145101
for (StoragePoolVO pool : pools) {
146102
DataStoreProvider storeProvider = dataStoreProviderMgr.getDataStoreProvider(pool.getStorageProviderName());
147103
DataStoreDriver storeDriver = storeProvider.getDataStoreDriver();
148104
if (storeDriver instanceof PrimaryDataStoreDriver) {
149105
PrimaryDataStoreDriver primaryStoreDriver = (PrimaryDataStoreDriver)storeDriver;
150106
if (primaryStoreDriver.isStorageSupportHA(pool.getPoolType())) {
151-
storageSupportHA = true;
152-
break;
107+
return true;
153108
}
154109
}
155110
}
156-
return storageSupportHA;
111+
return false;
157112
}
158113
}

plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHABase.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,9 @@
3535
public class KVMHABase {
3636
protected Logger logger = LogManager.getLogger(getClass());
3737
private long _timeout = 60000; /* 1 minutes */
38-
protected long _heartBeatUpdateTimeout = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.HEARTBEAT_UPDATE_TIMEOUT);
39-
protected long _heartBeatUpdateFreq = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY);
38+
protected long _heartBeatUpdateFreqInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY);
4039
protected long _heartBeatUpdateMaxTries = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_MAX_TRIES);
41-
protected long _heartBeatUpdateRetrySleep = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP);
40+
protected long _heartBeatUpdateRetrySleepInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP);
4241

4342
public static enum PoolType {
4443
PrimaryStorage, SecondaryStorage
@@ -234,7 +233,7 @@ protected String runScriptRetry(String cmdString, OutputInterpreter interpreter)
234233
return result;
235234
}
236235

237-
public Boolean checkingHeartBeat() {
236+
public Boolean hasHeartBeat() {
238237
// TODO Auto-generated method stub
239238
return null;
240239
}

plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -26,44 +26,42 @@
2626
public class KVMHAChecker extends KVMHABase implements Callable<Boolean> {
2727
private List<HAStoragePool> storagePools;
2828
private HostTO host;
29-
private boolean reportFailureIfOneStorageIsDown;
29+
private boolean reportIfHeartBeatFailedForOneStoragePool;
3030

31-
public KVMHAChecker(List<HAStoragePool> pools, HostTO host, boolean reportFailureIfOneStorageIsDown) {
31+
public KVMHAChecker(List<HAStoragePool> pools, HostTO host, boolean reportIfHeartBeatFailedForOneStoragePool) {
3232
this.storagePools = pools;
3333
this.host = host;
34-
this.reportFailureIfOneStorageIsDown = reportFailureIfOneStorageIsDown;
34+
this.reportIfHeartBeatFailedForOneStoragePool = reportIfHeartBeatFailedForOneStoragePool;
3535
}
3636

3737
/*
38-
* True means heartbeaing is on going, or we can't get it's status. False
39-
* means heartbeating is stopped definitely
38+
* True means heart beating is ongoing, or we can't get its status.
39+
* False means heart beating has definitely stopped.
4040
*/
4141
@Override
42-
public Boolean checkingHeartBeat() {
43-
boolean validResult = false;
44-
42+
public Boolean hasHeartBeat() {
4543
String hostAndPools = String.format("host IP [%s] in pools [%s]", host.getPrivateNetwork().getIp(), storagePools.stream().map(pool -> pool.getPoolUUID()).collect(Collectors.joining(", ")));
44+
logger.debug("Checking heart beat with KVMHAChecker for {}", hostAndPools);
4645

47-
logger.debug(String.format("Checking heart beat with KVMHAChecker for %s", hostAndPools));
48-
46+
boolean heartBeatCheckResult = false;
4947
for (HAStoragePool pool : storagePools) {
50-
validResult = pool.getPool().checkingHeartBeat(pool, host);
51-
if (reportFailureIfOneStorageIsDown && !validResult) {
48+
heartBeatCheckResult = pool.getPool().hasHeartBeat(pool, host);
49+
if (reportIfHeartBeatFailedForOneStoragePool && !heartBeatCheckResult) {
5250
break;
5351
}
5452
}
5553

56-
if (!validResult) {
57-
logger.warn(String.format("All checks with KVMHAChecker for %s considered it as dead. It may cause a shutdown of the host.", hostAndPools));
54+
if (!heartBeatCheckResult) {
55+
logger.warn("All checks with KVMHAChecker for {} considered it as dead. It may cause a shutdown of the host.", hostAndPools);
5856
}
5957

60-
return validResult;
58+
return heartBeatCheckResult;
6159
}
6260

6361
@Override
6462
public Boolean call() throws Exception {
6563
// logger.addAppender(new org.apache.log4j.ConsoleAppender(new
6664
// org.apache.log4j.PatternLayout(), "System.out"));
67-
return checkingHeartBeat();
65+
return hasHeartBeat();
6866
}
6967
}

plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ protected void runHeartBeat() {
9191
result = executePoolHeartBeatCommand(uuid, primaryStoragePool, result);
9292

9393
if (result != null && rebootHostAndAlertManagementOnHeartbeatTimeout) {
94-
logger.warn(String.format("Write heartbeat for pool [%s] failed: %s; stopping cloudstack-agent.", uuid, result));
94+
logger.warn("Write heartbeat for pool [{}] failed: {}; stopping cloudstack-agent.", uuid, result);
9595
primaryStoragePool.getPool().createHeartBeatCommand(primaryStoragePool, null, false);;
9696
}
9797
}
@@ -108,9 +108,9 @@ private String executePoolHeartBeatCommand(String uuid, HAStoragePool primarySto
108108
result = primaryStoragePool.getPool().createHeartBeatCommand(primaryStoragePool, hostPrivateIp, true);
109109

110110
if (result != null) {
111-
logger.warn(String.format("Write heartbeat for pool [%s] failed: %s; try: %s of %s.", uuid, result, i, _heartBeatUpdateMaxTries));
111+
logger.warn("Write heartbeat for pool [{}] failed: {}; try: {} of {}.", uuid, result, i, _heartBeatUpdateMaxTries);
112112
try {
113-
Thread.sleep(_heartBeatUpdateRetrySleep);
113+
Thread.sleep(_heartBeatUpdateRetrySleepInMs);
114114
} catch (InterruptedException e) {
115115
logger.debug("[IGNORED] Interrupted between heartbeat retries.", e);
116116
}
@@ -128,21 +128,21 @@ private void checkForNotExistingLibvirtStoragePools(Set<String> removedPools, St
128128
StoragePool storage = conn.storagePoolLookupByUUIDString(uuid);
129129
if (storage == null || storage.getInfo().state != StoragePoolState.VIR_STORAGE_POOL_RUNNING) {
130130
if (storage == null) {
131-
logger.debug(String.format("Libvirt storage pool [%s] not found, removing from HA list.", uuid));
131+
logger.debug("Libvirt storage pool [{}] not found, removing from HA list.", uuid);
132132
} else {
133-
logger.debug(String.format("Libvirt storage pool [%s] found, but not running, removing from HA list.", uuid));
133+
logger.debug("Libvirt storage pool [{}] found, but not running, removing from HA list.", uuid);
134134
}
135135

136136
removedPools.add(uuid);
137137
}
138138

139-
logger.debug(String.format("Found NFS storage pool [%s] in libvirt, continuing.", uuid));
139+
logger.debug("Found NFS storage pool [{}] in libvirt, continuing.", uuid);
140140

141141
} catch (LibvirtException e) {
142-
logger.debug(String.format("Failed to lookup libvirt storage pool [%s].", uuid), e);
142+
logger.debug("Failed to lookup libvirt storage pool [{}].", uuid, e);
143143

144144
if (e.toString().contains("pool not found")) {
145-
logger.debug(String.format("Removing pool [%s] from HA monitor since it was deleted.", uuid));
145+
logger.debug("Removing pool [{}] from HA monitor since it was deleted.", uuid);
146146
removedPools.add(uuid);
147147
}
148148
}
@@ -155,11 +155,10 @@ public void run() {
155155
runHeartBeat();
156156

157157
try {
158-
Thread.sleep(_heartBeatUpdateFreq);
158+
Thread.sleep(_heartBeatUpdateFreqInMs);
159159
} catch (InterruptedException e) {
160160
logger.debug("[IGNORED] Interrupted between heartbeats.", e);
161161
}
162162
}
163163
}
164-
165164
}

plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAVMActivityChecker.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,12 @@ public KVMHAVMActivityChecker(final HAStoragePool pool, final HostTO host, final
3939
}
4040

4141
@Override
42-
public Boolean checkingHeartBeat() {
43-
return this.storagePool.getPool().vmActivityCheck(storagePool, host, activityScriptTimeout, volumeUuidList, vmActivityCheckPath, suspectTimeInSeconds);
42+
public Boolean hasHeartBeat() {
43+
return this.storagePool.getPool().hasVmActivity(storagePool, host, activityScriptTimeout, volumeUuidList, vmActivityCheckPath, suspectTimeInSeconds);
4444
}
4545

4646
@Override
4747
public Boolean call() throws Exception {
48-
return checkingHeartBeat();
48+
return hasHeartBeat();
4949
}
5050
}

0 commit comments

Comments
 (0)