Skip to content

Commit 68694c4

Browse files
Fix to not cancel VM HA items when Host HA is enabled and an inspection is in progress, plus some code improvements
- When a Host HA inspection is in progress, the investigator returns the host status as Up, which cancels the VM HA items. - Don't cancel the VM HA items; instead, reschedule them to try again later.
1 parent c3f6d76 commit 68694c4

7 files changed

Lines changed: 53 additions & 23 deletions

File tree

api/src/main/java/org/apache/cloudstack/api/command/admin/ha/ConfigureHAForHostCmd.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ private void setupResponse(final boolean result, final String resourceUuid) {
8787
final HostHAResponse response = new HostHAResponse();
8888
response.setId(resourceUuid);
8989
response.setProvider(getHaProvider().toLowerCase());
90+
response.setStatus(result);
9091
response.setResponseName(getCommandName());
9192
setResponseObject(response);
9293
}

plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ public Status getHostAgentStatus(Host host) {
7979
}
8080

8181
if (haManager.isHAEligible(host)) {
82-
return haManager.getHostStatus(host);
82+
return haManager.getHostStatusFromHAConfig(host);
8383
}
8484

8585
List<StoragePoolVO> clusterPools = _storagePoolDao.findPoolsInClusters(Collections.singletonList(host.getClusterId()), null);

scripts/vm/hypervisor/kvm/kvmheartbeat.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ fi
7575
#delete VMs on this mountpoint
7676
deleteVMs() {
7777
local mountPoint=$1
78-
vmPids=$(ps aux| grep qemu | grep "$mountPoint" | awk '{print $2}' 2> /dev/null)
78+
vmPids=$(ps aux | grep qemu | grep "$mountPoint" | awk '{print $2}' 2> /dev/null)
7979
if [ $? -gt 0 ]
8080
then
8181
return
@@ -93,7 +93,7 @@ deleteVMs() {
9393
}
9494

9595
#checking is there the same nfs server mounted under $MountPoint?
96-
mounts=$(cat /proc/mounts |grep nfs|grep $MountPoint)
96+
mounts=$(cat /proc/mounts | grep nfs | grep $MountPoint)
9797
if [ $? -gt 0 ]
9898
then
9999
# remount it

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@
4242
import org.apache.cloudstack.framework.config.ConfigKey;
4343
import org.apache.cloudstack.framework.config.Configurable;
4444
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
45+
import org.apache.cloudstack.ha.HAConfig;
46+
import org.apache.cloudstack.ha.HAResource;
47+
import org.apache.cloudstack.ha.dao.HAConfigDao;
4548
import org.apache.cloudstack.managed.context.ManagedContext;
4649
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
4750
import org.apache.cloudstack.management.ManagementServerHost;
@@ -223,6 +226,8 @@ public void setHaPlanners(List<HAPlanner> haPlanners) {
223226
@Inject
224227
ConfigurationDao _configDao;
225228
@Inject
229+
HAConfigDao _haConfigDao;
230+
@Inject
226231
VolumeOrchestrationService volumeMgr;
227232

228233
String _instance;
@@ -237,25 +242,37 @@ public void setHaPlanners(List<HAPlanner> haPlanners) {
237242
long _timeBetweenCleanups;
238243
String _haTag = null;
239244

245+
protected HighAvailabilityManagerImpl() {
246+
}
247+
240248
private boolean vmHasPendingHAJob(final List<HaWorkVO> pendingHaWorks, final VMInstanceVO vm) {
241249
Optional<HaWorkVO> item = pendingHaWorks.stream()
242250
.filter(h -> h.getInstanceId() == vm.getId())
243251
.reduce((first, second) -> second);
244252
if (item.isPresent() && (item.get().getTimesTried() < _maxRetries ||
245253
!item.get().canScheduleNew(_timeBetweenFailures))) {
246-
logger.debug(String.format("Skipping HA on %s as there is already a running HA job for it", vm));
254+
logger.debug("Skipping HA on {} as there is already a running HA job for it", vm);
247255
return true;
248256
}
249257
return false;
250258
}
251259

252-
protected HighAvailabilityManagerImpl() {
260+
private boolean isHostHAInspectionInProgress(long hostId) {
261+
final HAConfig haConfig = _haConfigDao.findHAResource(hostId, HAResource.ResourceType.Host);
262+
if (haConfig == null || !haConfig.isEnabled()) {
263+
return false;
264+
}
265+
266+
HAConfig.HAState state = haConfig.getState();
267+
logger.debug("Checking Host HA inspection is in progress or not for the host {} from HAConfig, HA state is {}", hostId, state);
268+
return state == HAConfig.HAState.Suspect || state == HAConfig.HAState.Checking;
253269
}
254270

255271
@Override
256272
public Status investigate(final long hostId) {
257273
final HostVO host = _hostDao.findById(hostId);
258274
if (host == null) {
275+
logger.warn("Host with id {} is removed or doesn't exists.", hostId);
259276
return Status.Alert;
260277
}
261278

@@ -814,6 +831,9 @@ protected boolean checkAndCancelWorkIfNeeded(final HaWorkVO work) {
814831
if (!CancellableWorkReasonTypes.contains(work.getReasonType())) {
815832
return false;
816833
}
834+
if (isHostHAInspectionInProgress(work.getHostId())) {
835+
return false;
836+
}
817837
Status hostStatus = investigate(work.getHostId());
818838
if (!Status.Up.equals(hostStatus)) {
819839
return false;

server/src/main/java/org/apache/cloudstack/ha/HAManager.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,16 @@ public interface HAManager extends HAConfigManager {
6767
"The number of pending fence operations per management server. This setting determines the size of the size of the FENCE queue.", true);
6868

6969
boolean transitionHAState(final HAConfig.Event event, final HAConfig haConfig);
70+
7071
HAProvider getHAProvider(final String name);
72+
7173
HAResourceCounter getHACounter(final Long resourceId, final HAResource.ResourceType resourceType);
74+
7275
void purgeHACounter(final Long resourceId, final HAResource.ResourceType resourceType);
7376

7477
boolean isHAEligible(final HAResource resource);
78+
7579
Boolean isVMAliveOnHost(final Host host) throws Investigator.UnknownVM;
76-
Status getHostStatus(final Host host);
80+
81+
Status getHostStatusFromHAConfig(final Host host);
7782
}

server/src/main/java/org/apache/cloudstack/ha/HAManagerImpl.java

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ public boolean isHAEnabledForZone(final HAResource resource) {
248248
}
249249

250250
private boolean isHAEnabledForCluster(final HAResource resource) {
251+
// HA is enabled by default when cluster details doesn't exist
251252
if (resource == null || resource.getClusterId() == null) {
252253
return true;
253254
}
@@ -259,14 +260,10 @@ private boolean isHAEligibleForResource(final HAResource resource) {
259260
if (resource == null || resource.getId() < 1L) {
260261
return false;
261262
}
262-
HAResource.ResourceType resourceType = null;
263-
if (resource instanceof Host) {
264-
resourceType = HAResource.ResourceType.Host;
265-
}
266-
if (resourceType == null) {
263+
if (!(resource instanceof Host)) {
267264
return false;
268265
}
269-
final HAConfig haConfig = haConfigDao.findHAResource(resource.getId(), resourceType);
266+
final HAConfig haConfig = haConfigDao.findHAResource(resource.getId(), HAResource.ResourceType.Host);
270267
return haConfig != null && haConfig.isEnabled()
271268
&& haConfig.getState() != HAConfig.HAState.Disabled
272269
&& haConfig.getState() != HAConfig.HAState.Ineligible;
@@ -317,19 +314,23 @@ public Boolean isVMAliveOnHost(final Host host) throws Investigator.UnknownVM {
317314
throw new Investigator.UnknownVM();
318315
}
319316

320-
public Status getHostStatus(final Host host) {
317+
public Status getHostStatusFromHAConfig(final Host host) {
321318
final HAConfig haConfig = haConfigDao.findHAResource(host.getId(), HAResource.ResourceType.Host);
322-
if (haConfig != null) {
323-
if (haConfig.getState() == HAConfig.HAState.Fenced) {
324-
logger.debug("HA: Agent [{}] is available/suspect/checking Up.", host);
325-
return Status.Down;
326-
} else if (haConfig.getState() == HAConfig.HAState.Degraded || haConfig.getState() == HAConfig.HAState.Recovering || haConfig.getState() == HAConfig.HAState.Fencing) {
327-
logger.debug("HA: Agent [{}] is disconnected. State: {}, {}.", host, haConfig.getState(), haConfig.getState().getDescription());
328-
return Status.Disconnected;
329-
}
330-
return Status.Up;
319+
if (haConfig == null) {
320+
logger.warn("HA: Agent [{}] config is not available.", host);
321+
return Status.Unknown;
322+
}
323+
if (haConfig.getState() == HAConfig.HAState.Fenced) {
324+
logger.debug("HA: Agent [{}] is fenced.", host);
325+
return Status.Down;
331326
}
332-
return Status.Unknown;
327+
if (haConfig.getState() == HAConfig.HAState.Degraded || haConfig.getState() == HAConfig.HAState.Recovering || haConfig.getState() == HAConfig.HAState.Fencing) {
328+
logger.debug("HA: Agent [{}] is disconnected. State: {}, {}.", host, haConfig.getState(), haConfig.getState().getDescription());
329+
return Status.Disconnected;
330+
}
331+
332+
logger.debug("HA: Agent [{}] is considered Up (HA state can be Available/Suspect/Checking/Recovered). State: {}, {}.", host, haConfig.getState(), haConfig.getState().getDescription());
333+
return Status.Up;
333334
}
334335

335336
//////////////////////////////////////////////////////

server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import org.apache.cloudstack.engine.subsystem.api.storage.DataStoreProviderManager;
3636
import org.apache.cloudstack.framework.config.ConfigKey;
3737
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
38+
import org.apache.cloudstack.ha.dao.HAConfigDao;
3839
import org.apache.cloudstack.managed.context.ManagedContext;
3940
import org.apache.logging.log4j.LogManager;
4041
import org.apache.logging.log4j.Logger;
@@ -118,6 +119,8 @@ public class HighAvailabilityManagerImplTest {
118119
@Mock
119120
ConfigurationDao _configDao;
120121
@Mock
122+
HAConfigDao _haConfigDao;
123+
@Mock
121124
VolumeOrchestrationService volumeMgr;
122125
@Mock
123126
ConsoleProxyManager consoleProxyManager;

0 commit comments

Comments
 (0)